Upload 10 files
Browse files- README.md +4 -12
- app.py +49 -0
- cluster_outliers.csv +0 -0
- main.ipynb +1008 -0
- onnx_model/config.json +27 -0
- onnx_model/model.onnx +3 -0
- outlier_detection.ipynb +2292 -0
- pycaret_outlier_detection.ipynb +0 -0
- requirements.txt +10 -0
- trainer.ipynb +1165 -0
README.md
CHANGED
@@ -1,13 +1,5 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.12.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: unknown
|
11 |
-
---
|
12 |
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# sin-kaf
|
2 |
+
# dataset link (Turkis)
|
3 |
+
# https://sites.google.com/site/offensevalsharedtask/more-datasets
|
4 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
app.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gra
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from transformers import AutoModelForSequenceClassification
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
from optimum.onnxruntime import ORTModel
|
7 |
+
import onnxruntime as rt
|
8 |
+
|
9 |
+
|
10 |
+
ort_session = rt.InferenceSession("/DATA/sin-kaf/onnx_model/model.onnx")
|
11 |
+
ort_session.get_providers()
|
12 |
+
|
13 |
+
# model = ORTModel.load_model("/DATA/sin-kaf/onnx_model/model.onnx")
|
14 |
+
# model = AutoModelForSequenceClassification.from_pretrained('/DATA/sin-kaf/test_trainer/checkpoint-18500')
|
15 |
+
tokenizer = AutoTokenizer.from_pretrained("Overfit-GM/distilbert-base-turkish-cased-offensive")
|
16 |
+
|
17 |
+
def user_greeting(sent):
|
18 |
+
|
19 |
+
encoded_dict = tokenizer.encode_plus(
|
20 |
+
sent,
|
21 |
+
add_special_tokens = True,
|
22 |
+
max_length = 64,
|
23 |
+
pad_to_max_length = True,
|
24 |
+
return_attention_mask = True,
|
25 |
+
return_tensors = 'pt',
|
26 |
+
)
|
27 |
+
|
28 |
+
|
29 |
+
input_ids = encoded_dict['input_ids']
|
30 |
+
attention_masks = encoded_dict['attention_mask']
|
31 |
+
|
32 |
+
|
33 |
+
input_ids = torch.cat([input_ids], dim=0)
|
34 |
+
input_mask = torch.cat([attention_masks], dim=0)
|
35 |
+
|
36 |
+
input_feed = {
|
37 |
+
"input_ids": input_ids.tolist(),
|
38 |
+
"attention_mask":input_mask.tolist(),
|
39 |
+
}
|
40 |
+
output = ort_session.run(None, input_feed)
|
41 |
+
return np.argmax((output[0][0]))
|
42 |
+
# outputs = model(input_ids, input_mask)
|
43 |
+
# return torch.argmax(outputs['logits'])
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
app = gra.Interface(fn = user_greeting, inputs="text", outputs="text")
|
48 |
+
app.launch()
|
49 |
+
# app.launch(server_name="0.0.0.0")
|
cluster_outliers.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
main.ipynb
ADDED
@@ -0,0 +1,1008 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"# dataset link (Turkis)\n",
|
10 |
+
"# https://sites.google.com/site/offensevalsharedtask/more-datasets"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 1,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [
|
18 |
+
{
|
19 |
+
"name": "stderr",
|
20 |
+
"output_type": "stream",
|
21 |
+
"text": [
|
22 |
+
"/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/neptune/internal/backends/hosted_client.py:51: NeptuneDeprecationWarning: The 'neptune-client' package has been deprecated and will be removed in the future. Install the 'neptune' package instead. For more, see https://docs.neptune.ai/setup/upgrading/\n",
|
23 |
+
" from neptune.version import version as neptune_client_version\n",
|
24 |
+
"/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/pytorch_lightning/loggers/neptune.py:39: NeptuneDeprecationWarning: You're importing the Neptune client library via the deprecated `neptune.new` module, which will be removed in a future release. Import directly from `neptune` instead.\n",
|
25 |
+
" from neptune import new as neptune\n"
|
26 |
+
]
|
27 |
+
}
|
28 |
+
],
|
29 |
+
"source": [
|
30 |
+
"import os\n",
|
31 |
+
"import numpy as np\n",
|
32 |
+
"import pandas as pd\n",
|
33 |
+
"import pytorch_lightning as pl\n",
|
34 |
+
"import random\n",
|
35 |
+
"import torch\n",
|
36 |
+
"import emoji\n",
|
37 |
+
"\n",
|
38 |
+
"\n",
|
39 |
+
"import datetime\n",
|
40 |
+
"import numpy as np\n",
|
41 |
+
"import torch.optim as optim\n",
|
42 |
+
"\n",
|
43 |
+
"\n",
|
44 |
+
"import torch.nn as nn\n",
|
45 |
+
"\n",
|
46 |
+
"from torch.utils.data import DataLoader,Dataset,random_split,TensorDataset ,RandomSampler, SequentialSampler\n",
|
47 |
+
"from torchmetrics import Accuracy, F1Score \n",
|
48 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
49 |
+
"from pytorch_lightning.callbacks import EarlyStopping,ModelCheckpoint\n",
|
50 |
+
"from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n",
|
51 |
+
"from sklearn.model_selection import train_test_split\n",
|
52 |
+
"\n",
|
53 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
54 |
+
"from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup"
|
55 |
+
]
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"cell_type": "code",
|
59 |
+
"execution_count": 2,
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [
|
62 |
+
{
|
63 |
+
"data": {
|
64 |
+
"text/plain": [
|
65 |
+
"device(type='cuda', index=0)"
|
66 |
+
]
|
67 |
+
},
|
68 |
+
"execution_count": 2,
|
69 |
+
"metadata": {},
|
70 |
+
"output_type": "execute_result"
|
71 |
+
}
|
72 |
+
],
|
73 |
+
"source": [
|
74 |
+
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
|
75 |
+
"device"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 3,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"data": {
|
85 |
+
"text/plain": [
|
86 |
+
"True"
|
87 |
+
]
|
88 |
+
},
|
89 |
+
"execution_count": 3,
|
90 |
+
"metadata": {},
|
91 |
+
"output_type": "execute_result"
|
92 |
+
}
|
93 |
+
],
|
94 |
+
"source": [
|
95 |
+
"torch.cuda.is_available()"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 4,
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
+
"seed_val = 42\n",
|
105 |
+
"random.seed(seed_val)\n",
|
106 |
+
"np.random.seed(seed_val)\n",
|
107 |
+
"torch.manual_seed(seed_val)\n",
|
108 |
+
"torch.cuda.manual_seed_all(seed_val)"
|
109 |
+
]
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"attachments": {},
|
113 |
+
"cell_type": "markdown",
|
114 |
+
"metadata": {},
|
115 |
+
"source": [
|
116 |
+
"# load dataaset\n"
|
117 |
+
]
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"cell_type": "code",
|
121 |
+
"execution_count": 5,
|
122 |
+
"metadata": {},
|
123 |
+
"outputs": [],
|
124 |
+
"source": [
|
125 |
+
"# train_df=pd.read_csv('SemEval-2020 dataset/offenseval2020-turkish/offenseval2020-turkish/offenseval-tr-training-v1/offenseval-tr-training-v1.tsv',sep='\\t')\n",
|
126 |
+
"# test_df=pd.read_csv('SemEval-2020 dataset/offenseval2020-turkish/offenseval2020-turkish/offenseval-tr-testset-v1/offenseval-tr-testset-v1.tsv',sep='\\t')"
|
127 |
+
]
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"cell_type": "code",
|
131 |
+
"execution_count": 6,
|
132 |
+
"metadata": {},
|
133 |
+
"outputs": [
|
134 |
+
{
|
135 |
+
"ename": "NameError",
|
136 |
+
"evalue": "name 'train_df' is not defined",
|
137 |
+
"output_type": "error",
|
138 |
+
"traceback": [
|
139 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
140 |
+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
141 |
+
"Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m train_df\u001b[39m=\u001b[39mpd\u001b[39m.\u001b[39mconcat([train_df,test_df], axis\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n\u001b[1;32m 2\u001b[0m train_df\u001b[39m=\u001b[39mtrain_df\u001b[39m.\u001b[39mdrop([\u001b[39m'\u001b[39m\u001b[39mid\u001b[39m\u001b[39m'\u001b[39m], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n",
|
142 |
+
"\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
|
143 |
+
]
|
144 |
+
}
|
145 |
+
],
|
146 |
+
"source": [
|
147 |
+
"train_df=pd.concat([train_df,test_df], axis=0)\n",
|
148 |
+
"train_df=train_df.drop(['id'], axis=1)"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "code",
|
153 |
+
"execution_count": null,
|
154 |
+
"metadata": {},
|
155 |
+
"outputs": [
|
156 |
+
{
|
157 |
+
"data": {
|
158 |
+
"text/plain": [
|
159 |
+
"subtask_a\n",
|
160 |
+
"NOT 25231\n",
|
161 |
+
"OFF 6046\n",
|
162 |
+
"Name: count, dtype: int64"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
"execution_count": 7,
|
166 |
+
"metadata": {},
|
167 |
+
"output_type": "execute_result"
|
168 |
+
}
|
169 |
+
],
|
170 |
+
"source": [
|
171 |
+
"train_df['subtask_a'].value_counts()"
|
172 |
+
]
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"cell_type": "code",
|
176 |
+
"execution_count": 8,
|
177 |
+
"metadata": {},
|
178 |
+
"outputs": [],
|
179 |
+
"source": [
|
180 |
+
"data=train_df['tweet'].tolist()\n",
|
181 |
+
"for i in range(len(data)):\n",
|
182 |
+
" data[i] = data[i].replace('@USER','')\n",
|
183 |
+
" data[i] = data[i].replace('#','')\n",
|
184 |
+
" data[i] = data[i].replace('$','')\n",
|
185 |
+
" data[i] = emoji.demojize(data[i])\n",
|
186 |
+
" \n",
|
187 |
+
"train_df['tweet'] = data"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 9,
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [],
|
195 |
+
"source": [
|
196 |
+
"lab = LabelEncoder()\n",
|
197 |
+
"train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "code",
|
202 |
+
"execution_count": 10,
|
203 |
+
"metadata": {},
|
204 |
+
"outputs": [
|
205 |
+
{
|
206 |
+
"data": {
|
207 |
+
"text/plain": [
|
208 |
+
"subtask_a\n",
|
209 |
+
"0 25231\n",
|
210 |
+
"1 6046\n",
|
211 |
+
"2 3515\n",
|
212 |
+
"Name: count, dtype: int64"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
"execution_count": 10,
|
216 |
+
"metadata": {},
|
217 |
+
"output_type": "execute_result"
|
218 |
+
}
|
219 |
+
],
|
220 |
+
"source": [
|
221 |
+
"train_df['subtask_a'].value_counts()"
|
222 |
+
]
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"cell_type": "code",
|
226 |
+
"execution_count": 11,
|
227 |
+
"metadata": {},
|
228 |
+
"outputs": [],
|
229 |
+
"source": [
|
230 |
+
"train_df.drop(train_df[train_df['subtask_a'] == 2].index, inplace = True)"
|
231 |
+
]
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"cell_type": "code",
|
235 |
+
"execution_count": 12,
|
236 |
+
"metadata": {},
|
237 |
+
"outputs": [
|
238 |
+
{
|
239 |
+
"data": {
|
240 |
+
"text/plain": [
|
241 |
+
"subtask_a\n",
|
242 |
+
"0 22345\n",
|
243 |
+
"1 5417\n",
|
244 |
+
"Name: count, dtype: int64"
|
245 |
+
]
|
246 |
+
},
|
247 |
+
"execution_count": 12,
|
248 |
+
"metadata": {},
|
249 |
+
"output_type": "execute_result"
|
250 |
+
}
|
251 |
+
],
|
252 |
+
"source": [
|
253 |
+
"train_df['subtask_a'].value_counts()"
|
254 |
+
]
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"cell_type": "code",
|
258 |
+
"execution_count": 13,
|
259 |
+
"metadata": {},
|
260 |
+
"outputs": [
|
261 |
+
{
|
262 |
+
"data": {
|
263 |
+
"text/html": [
|
264 |
+
"<div>\n",
|
265 |
+
"<style scoped>\n",
|
266 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
267 |
+
" vertical-align: middle;\n",
|
268 |
+
" }\n",
|
269 |
+
"\n",
|
270 |
+
" .dataframe tbody tr th {\n",
|
271 |
+
" vertical-align: top;\n",
|
272 |
+
" }\n",
|
273 |
+
"\n",
|
274 |
+
" .dataframe thead th {\n",
|
275 |
+
" text-align: right;\n",
|
276 |
+
" }\n",
|
277 |
+
"</style>\n",
|
278 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
279 |
+
" <thead>\n",
|
280 |
+
" <tr style=\"text-align: right;\">\n",
|
281 |
+
" <th></th>\n",
|
282 |
+
" <th>tweet</th>\n",
|
283 |
+
" <th>subtask_a</th>\n",
|
284 |
+
" </tr>\n",
|
285 |
+
" </thead>\n",
|
286 |
+
" <tbody>\n",
|
287 |
+
" <tr>\n",
|
288 |
+
" <th>3515</th>\n",
|
289 |
+
" <td>holstein ineği (alacalı siyah-beyaz inek, yani...</td>\n",
|
290 |
+
" <td>0</td>\n",
|
291 |
+
" </tr>\n",
|
292 |
+
" <tr>\n",
|
293 |
+
" <th>3516</th>\n",
|
294 |
+
" <td>Haaaa. O zaman oylar Binali'ye demek.</td>\n",
|
295 |
+
" <td>0</td>\n",
|
296 |
+
" </tr>\n",
|
297 |
+
" <tr>\n",
|
298 |
+
" <th>3517</th>\n",
|
299 |
+
" <td>Disk genel merkez yönetimine HDP'nin hiç etki...</td>\n",
|
300 |
+
" <td>0</td>\n",
|
301 |
+
" </tr>\n",
|
302 |
+
" <tr>\n",
|
303 |
+
" <th>3518</th>\n",
|
304 |
+
" <td>Bir insanı zorla kaliteli yapamazsın. Sen elin...</td>\n",
|
305 |
+
" <td>0</td>\n",
|
306 |
+
" </tr>\n",
|
307 |
+
" <tr>\n",
|
308 |
+
" <th>3519</th>\n",
|
309 |
+
" <td>Sus yaa açtım sonra korkudan telefon elimden ...</td>\n",
|
310 |
+
" <td>0</td>\n",
|
311 |
+
" </tr>\n",
|
312 |
+
" <tr>\n",
|
313 |
+
" <th>...</th>\n",
|
314 |
+
" <td>...</td>\n",
|
315 |
+
" <td>...</td>\n",
|
316 |
+
" </tr>\n",
|
317 |
+
" <tr>\n",
|
318 |
+
" <th>31272</th>\n",
|
319 |
+
" <td>Bu ödül sunan kızı kim giydirdiyse, kızın en b...</td>\n",
|
320 |
+
" <td>0</td>\n",
|
321 |
+
" </tr>\n",
|
322 |
+
" <tr>\n",
|
323 |
+
" <th>31273</th>\n",
|
324 |
+
" <td>Bunu sana beddua olarak etmiyorum bunlar ilerd...</td>\n",
|
325 |
+
" <td>0</td>\n",
|
326 |
+
" </tr>\n",
|
327 |
+
" <tr>\n",
|
328 |
+
" <th>31274</th>\n",
|
329 |
+
" <td>CHP'liler sandıkları bırakmıyor üstüne oturmuş...</td>\n",
|
330 |
+
" <td>1</td>\n",
|
331 |
+
" </tr>\n",
|
332 |
+
" <tr>\n",
|
333 |
+
" <th>31275</th>\n",
|
334 |
+
" <td>karanlığın içinde yalnız kalsam ne oluuuuurr</td>\n",
|
335 |
+
" <td>0</td>\n",
|
336 |
+
" </tr>\n",
|
337 |
+
" <tr>\n",
|
338 |
+
" <th>31276</th>\n",
|
339 |
+
" <td>Ne yalan söyleyeyim bu haftalıkta fitil olara...</td>\n",
|
340 |
+
" <td>0</td>\n",
|
341 |
+
" </tr>\n",
|
342 |
+
" </tbody>\n",
|
343 |
+
"</table>\n",
|
344 |
+
"<p>27762 rows × 2 columns</p>\n",
|
345 |
+
"</div>"
|
346 |
+
],
|
347 |
+
"text/plain": [
|
348 |
+
" tweet subtask_a\n",
|
349 |
+
"3515 holstein ineği (alacalı siyah-beyaz inek, yani... 0\n",
|
350 |
+
"3516 Haaaa. O zaman oylar Binali'ye demek. 0\n",
|
351 |
+
"3517 Disk genel merkez yönetimine HDP'nin hiç etki... 0\n",
|
352 |
+
"3518 Bir insanı zorla kaliteli yapamazsın. Sen elin... 0\n",
|
353 |
+
"3519 Sus yaa açtım sonra korkudan telefon elimden ... 0\n",
|
354 |
+
"... ... ...\n",
|
355 |
+
"31272 Bu ödül sunan kızı kim giydirdiyse, kızın en b... 0\n",
|
356 |
+
"31273 Bunu sana beddua olarak etmiyorum bunlar ilerd... 0\n",
|
357 |
+
"31274 CHP'liler sandıkları bırakmıyor üstüne oturmuş... 1\n",
|
358 |
+
"31275 karanlığın içinde yalnız kalsam ne oluuuuurr 0\n",
|
359 |
+
"31276 Ne yalan söyleyeyim bu haftalıkta fitil olara... 0\n",
|
360 |
+
"\n",
|
361 |
+
"[27762 rows x 2 columns]"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
"execution_count": 13,
|
365 |
+
"metadata": {},
|
366 |
+
"output_type": "execute_result"
|
367 |
+
}
|
368 |
+
],
|
369 |
+
"source": [
|
370 |
+
"train_df"
|
371 |
+
]
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"cell_type": "code",
|
375 |
+
"execution_count": 14,
|
376 |
+
"metadata": {},
|
377 |
+
"outputs": [],
|
378 |
+
"source": [
|
379 |
+
"data = train_df.tweet.values\n",
|
380 |
+
"labels = train_df.subtask_a.values"
|
381 |
+
]
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"attachments": {},
|
385 |
+
"cell_type": "markdown",
|
386 |
+
"metadata": {},
|
387 |
+
"source": [
|
388 |
+
"# BERT Tokenizer"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
{
|
392 |
+
"cell_type": "code",
|
393 |
+
"execution_count": 15,
|
394 |
+
"metadata": {},
|
395 |
+
"outputs": [],
|
396 |
+
"source": [
|
397 |
+
"tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", do_basic_tokenize=True)\n",
|
398 |
+
"# tokenizer.add_tokens(data)"
|
399 |
+
]
|
400 |
+
},
|
401 |
+
{
|
402 |
+
"cell_type": "code",
|
403 |
+
"execution_count": 16,
|
404 |
+
"metadata": {},
|
405 |
+
"outputs": [
|
406 |
+
{
|
407 |
+
"name": "stdout",
|
408 |
+
"output_type": "stream",
|
409 |
+
"text": [
|
410 |
+
" Original: Sallandık diyorum, merkezi bilmiyorum, sokağa fırlamadım, duruyorum. Senden bir açıklama gelmeden, ben bu sandığı terketmiyorum \n",
|
411 |
+
"Tokenized: ['Sal', '##landı', '##k', 'di', '##yor', '##um', ',', 'merkezi', 'bil', '##mi', '##yor', '##um', ',', 'sok', '##a', '##ğa', 'f', '##ır', '##lama', '##dı', '##m', ',', 'dur', '##uy', '##orum', '.', 'Sen', '##den', 'bir', 'açık', '##lama', 'gel', '##mede', '##n', ',', 'ben', 'bu', 'sand', '##ığı', 'ter', '##ket', '##mi', '##yor', '##um']\n",
|
412 |
+
"Token IDs: [64831, 35783, 10174, 10120, 26101, 10465, 117, 47522, 13897, 10500, 26101, 10465, 117, 29509, 10113, 25163, 174, 17145, 24540, 17532, 10147, 117, 28959, 53452, 28048, 119, 18082, 10633, 10561, 71769, 24540, 74458, 59268, 10115, 117, 11015, 11499, 45989, 28581, 12718, 13650, 10500, 26101, 10465]\n"
|
413 |
+
]
|
414 |
+
}
|
415 |
+
],
|
416 |
+
"source": [
|
417 |
+
"print(' Original: ', data[78])\n",
|
418 |
+
"print('Tokenized: ', tokenizer.tokenize(data[78]))\n",
|
419 |
+
"print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(data[78])))"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"attachments": {},
|
424 |
+
"cell_type": "markdown",
|
425 |
+
"metadata": {},
|
426 |
+
"source": [
|
427 |
+
"# Tokenize Dataset"
|
428 |
+
]
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"cell_type": "code",
|
432 |
+
"execution_count": 17,
|
433 |
+
"metadata": {},
|
434 |
+
"outputs": [
|
435 |
+
{
|
436 |
+
"name": "stderr",
|
437 |
+
"output_type": "stream",
|
438 |
+
"text": [
|
439 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (1277 > 512). Running this sequence through the model will result in indexing errors\n"
|
440 |
+
]
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"name": "stdout",
|
444 |
+
"output_type": "stream",
|
445 |
+
"text": [
|
446 |
+
"Max sentence length: 6906\n"
|
447 |
+
]
|
448 |
+
}
|
449 |
+
],
|
450 |
+
"source": [
|
451 |
+
"max_len = 0\n",
|
452 |
+
"for sent in data:\n",
|
453 |
+
"\n",
|
454 |
+
" input_ids = tokenizer.encode(sent, add_special_tokens=True)\n",
|
455 |
+
" max_len = max(max_len, len(input_ids))\n",
|
456 |
+
"\n",
|
457 |
+
"print('Max sentence length: ', max_len)"
|
458 |
+
]
|
459 |
+
},
|
460 |
+
{
|
461 |
+
"cell_type": "code",
|
462 |
+
"execution_count": 18,
|
463 |
+
"metadata": {},
|
464 |
+
"outputs": [
|
465 |
+
{
|
466 |
+
"name": "stderr",
|
467 |
+
"output_type": "stream",
|
468 |
+
"text": [
|
469 |
+
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
|
470 |
+
"/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
|
471 |
+
" warnings.warn(\n"
|
472 |
+
]
|
473 |
+
},
|
474 |
+
{
|
475 |
+
"name": "stdout",
|
476 |
+
"output_type": "stream",
|
477 |
+
"text": [
|
478 |
+
"Original: holstein ineği (alacalı siyah-beyaz inek, yani hollanda ineği) en verimli süt alınan inek ırkıymış, trt belgesel'de öyle söylediler\n",
|
479 |
+
"Token IDs: tensor([ 101, 110516, 16206, 10106, 10112, 16054, 113, 21739, 15794,\n",
|
480 |
+
" 10713, 34543, 10237, 118, 110744, 10106, 10707, 117, 84251,\n",
|
481 |
+
" 46232, 41971, 10106, 10112, 16054, 114, 10110, 55011, 98373,\n",
|
482 |
+
" 187, 41559, 10164, 65890, 10106, 10707, 321, 16299, 10713,\n",
|
483 |
+
" 16889, 19733, 117, 32221, 10123, 34831, 12912, 112, 10104,\n",
|
484 |
+
" 276, 18369, 100721, 18369, 28113, 10165, 102, 0, 0,\n",
|
485 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
486 |
+
" 0])\n"
|
487 |
+
]
|
488 |
+
}
|
489 |
+
],
|
490 |
+
"source": [
|
491 |
+
"input_ids = []\n",
|
492 |
+
"attention_masks = []\n",
|
493 |
+
"\n",
|
494 |
+
"for sent in data:\n",
|
495 |
+
" encoded_dict = tokenizer.encode_plus(\n",
|
496 |
+
" sent, \n",
|
497 |
+
" add_special_tokens = True, \n",
|
498 |
+
" max_length = 64, \n",
|
499 |
+
" pad_to_max_length = True,\n",
|
500 |
+
" return_attention_mask = True, \n",
|
501 |
+
" return_tensors = 'pt', \n",
|
502 |
+
" )\n",
|
503 |
+
" \n",
|
504 |
+
" \n",
|
505 |
+
" input_ids.append(encoded_dict['input_ids'])\n",
|
506 |
+
" attention_masks.append(encoded_dict['attention_mask'])\n",
|
507 |
+
"\n",
|
508 |
+
"\n",
|
509 |
+
"input_ids = torch.cat(input_ids, dim=0)\n",
|
510 |
+
"attention_masks = torch.cat(attention_masks, dim=0)\n",
|
511 |
+
"labels = torch.tensor(labels)\n",
|
512 |
+
"\n",
|
513 |
+
"\n",
|
514 |
+
"print('Original: ', data[0])\n",
|
515 |
+
"print('Token IDs:', input_ids[0])"
|
516 |
+
]
|
517 |
+
},
|
518 |
+
{
|
519 |
+
"attachments": {},
|
520 |
+
"cell_type": "markdown",
|
521 |
+
"metadata": {},
|
522 |
+
"source": [
|
523 |
+
"# Split Dataset"
|
524 |
+
]
|
525 |
+
},
|
526 |
+
{
|
527 |
+
"cell_type": "code",
|
528 |
+
"execution_count": 19,
|
529 |
+
"metadata": {},
|
530 |
+
"outputs": [
|
531 |
+
{
|
532 |
+
"name": "stdout",
|
533 |
+
"output_type": "stream",
|
534 |
+
"text": [
|
535 |
+
"24,985 training samples\n",
|
536 |
+
"2,777 validation samples\n"
|
537 |
+
]
|
538 |
+
}
|
539 |
+
],
|
540 |
+
"source": [
|
541 |
+
"dataset = TensorDataset(input_ids, attention_masks, labels)\n",
|
542 |
+
"train_size = int(0.9 * len(dataset))\n",
|
543 |
+
"val_size = len(dataset) - train_size\n",
|
544 |
+
"\n",
|
545 |
+
"\n",
|
546 |
+
"train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n",
|
547 |
+
"\n",
|
548 |
+
"print('{:>5,} training samples'.format(train_size))\n",
|
549 |
+
"print('{:>5,} validation samples'.format(val_size))"
|
550 |
+
]
|
551 |
+
},
|
552 |
+
{
|
553 |
+
"cell_type": "code",
|
554 |
+
"execution_count": 20,
|
555 |
+
"metadata": {},
|
556 |
+
"outputs": [
|
557 |
+
{
|
558 |
+
"name": "stderr",
|
559 |
+
"output_type": "stream",
|
560 |
+
"text": [
|
561 |
+
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
|
562 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"data": {
|
567 |
+
"text/plain": [
|
568 |
+
"BertForSequenceClassification(\n",
|
569 |
+
" (bert): BertModel(\n",
|
570 |
+
" (embeddings): BertEmbeddings(\n",
|
571 |
+
" (word_embeddings): Embedding(119547, 768, padding_idx=0)\n",
|
572 |
+
" (position_embeddings): Embedding(512, 768)\n",
|
573 |
+
" (token_type_embeddings): Embedding(2, 768)\n",
|
574 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
575 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
576 |
+
" )\n",
|
577 |
+
" (encoder): BertEncoder(\n",
|
578 |
+
" (layer): ModuleList(\n",
|
579 |
+
" (0-11): 12 x BertLayer(\n",
|
580 |
+
" (attention): BertAttention(\n",
|
581 |
+
" (self): BertSelfAttention(\n",
|
582 |
+
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
|
583 |
+
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
|
584 |
+
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
|
585 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
586 |
+
" )\n",
|
587 |
+
" (output): BertSelfOutput(\n",
|
588 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
589 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
590 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
591 |
+
" )\n",
|
592 |
+
" )\n",
|
593 |
+
" (intermediate): BertIntermediate(\n",
|
594 |
+
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
|
595 |
+
" (intermediate_act_fn): GELUActivation()\n",
|
596 |
+
" )\n",
|
597 |
+
" (output): BertOutput(\n",
|
598 |
+
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
|
599 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
600 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
601 |
+
" )\n",
|
602 |
+
" )\n",
|
603 |
+
" )\n",
|
604 |
+
" )\n",
|
605 |
+
" (pooler): BertPooler(\n",
|
606 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
607 |
+
" (activation): Tanh()\n",
|
608 |
+
" )\n",
|
609 |
+
" )\n",
|
610 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
611 |
+
" (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
|
612 |
+
")"
|
613 |
+
]
|
614 |
+
},
|
615 |
+
"execution_count": 20,
|
616 |
+
"metadata": {},
|
617 |
+
"output_type": "execute_result"
|
618 |
+
}
|
619 |
+
],
|
620 |
+
"source": [
|
621 |
+
"from transformers import BertForSequenceClassification, AdamW, BertConfig\n",
|
622 |
+
"\n",
|
623 |
+
"model = BertForSequenceClassification.from_pretrained(\n",
|
624 |
+
" \"bert-base-multilingual-cased\",\n",
|
625 |
+
" num_labels = 2, \n",
|
626 |
+
" output_attentions = False,\n",
|
627 |
+
" output_hidden_states = False, \n",
|
628 |
+
")\n",
|
629 |
+
"\n",
|
630 |
+
"model.cuda()"
|
631 |
+
]
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"cell_type": "code",
|
635 |
+
"execution_count": 21,
|
636 |
+
"metadata": {},
|
637 |
+
"outputs": [
|
638 |
+
{
|
639 |
+
"name": "stdout",
|
640 |
+
"output_type": "stream",
|
641 |
+
"text": [
|
642 |
+
"The BERT model has 201 different named parameters.\n",
|
643 |
+
"\n",
|
644 |
+
"==== Embedding Layer ====\n",
|
645 |
+
"\n",
|
646 |
+
"bert.embeddings.word_embeddings.weight (119547, 768)\n",
|
647 |
+
"bert.embeddings.position_embeddings.weight (512, 768)\n",
|
648 |
+
"bert.embeddings.token_type_embeddings.weight (2, 768)\n",
|
649 |
+
"bert.embeddings.LayerNorm.weight (768,)\n",
|
650 |
+
"bert.embeddings.LayerNorm.bias (768,)\n",
|
651 |
+
"\n",
|
652 |
+
"==== First Transformer ====\n",
|
653 |
+
"\n",
|
654 |
+
"bert.encoder.layer.0.attention.self.query.weight (768, 768)\n",
|
655 |
+
"bert.encoder.layer.0.attention.self.query.bias (768,)\n",
|
656 |
+
"bert.encoder.layer.0.attention.self.key.weight (768, 768)\n",
|
657 |
+
"bert.encoder.layer.0.attention.self.key.bias (768,)\n",
|
658 |
+
"bert.encoder.layer.0.attention.self.value.weight (768, 768)\n",
|
659 |
+
"bert.encoder.layer.0.attention.self.value.bias (768,)\n",
|
660 |
+
"bert.encoder.layer.0.attention.output.dense.weight (768, 768)\n",
|
661 |
+
"bert.encoder.layer.0.attention.output.dense.bias (768,)\n",
|
662 |
+
"bert.encoder.layer.0.attention.output.LayerNorm.weight (768,)\n",
|
663 |
+
"bert.encoder.layer.0.attention.output.LayerNorm.bias (768,)\n",
|
664 |
+
"bert.encoder.layer.0.intermediate.dense.weight (3072, 768)\n",
|
665 |
+
"bert.encoder.layer.0.intermediate.dense.bias (3072,)\n",
|
666 |
+
"bert.encoder.layer.0.output.dense.weight (768, 3072)\n",
|
667 |
+
"bert.encoder.layer.0.output.dense.bias (768,)\n",
|
668 |
+
"bert.encoder.layer.0.output.LayerNorm.weight (768,)\n",
|
669 |
+
"bert.encoder.layer.0.output.LayerNorm.bias (768,)\n",
|
670 |
+
"\n",
|
671 |
+
"==== Output Layer ====\n",
|
672 |
+
"\n",
|
673 |
+
"bert.pooler.dense.weight (768, 768)\n",
|
674 |
+
"bert.pooler.dense.bias (768,)\n",
|
675 |
+
"classifier.weight (2, 768)\n",
|
676 |
+
"classifier.bias (2,)\n"
|
677 |
+
]
|
678 |
+
}
|
679 |
+
],
|
680 |
+
"source": [
|
681 |
+
"params = list(model.named_parameters())\n",
|
682 |
+
"\n",
|
683 |
+
"print('The BERT model has {:} different named parameters.\\n'.format(len(params)))\n",
|
684 |
+
"\n",
|
685 |
+
"print('==== Embedding Layer ====\\n')\n",
|
686 |
+
"\n",
|
687 |
+
"for p in params[0:5]:\n",
|
688 |
+
" print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n",
|
689 |
+
"\n",
|
690 |
+
"print('\\n==== First Transformer ====\\n')\n",
|
691 |
+
"\n",
|
692 |
+
"for p in params[5:21]:\n",
|
693 |
+
" print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n",
|
694 |
+
"\n",
|
695 |
+
"print('\\n==== Output Layer ====\\n')\n",
|
696 |
+
"\n",
|
697 |
+
"for p in params[-4:]:\n",
|
698 |
+
" print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))"
|
699 |
+
]
|
700 |
+
},
|
701 |
+
{
|
702 |
+
"cell_type": "code",
|
703 |
+
"execution_count": 22,
|
704 |
+
"metadata": {},
|
705 |
+
"outputs": [
|
706 |
+
{
|
707 |
+
"name": "stderr",
|
708 |
+
"output_type": "stream",
|
709 |
+
"text": [
|
710 |
+
"/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
|
711 |
+
" warnings.warn(\n"
|
712 |
+
]
|
713 |
+
}
|
714 |
+
],
|
715 |
+
"source": [
|
716 |
+
"optimizer = AdamW(model.parameters(),\n",
|
717 |
+
" lr = 2e-5,\n",
|
718 |
+
" eps = 1e-8\n",
|
719 |
+
" )"
|
720 |
+
]
|
721 |
+
},
|
722 |
+
{
|
723 |
+
"cell_type": "code",
|
724 |
+
"execution_count": 23,
|
725 |
+
"metadata": {},
|
726 |
+
"outputs": [],
|
727 |
+
"source": [
|
728 |
+
"def flat_accuracy(preds, labels):\n",
|
729 |
+
" pred_flat = np.argmax(preds, axis=1).flatten()\n",
|
730 |
+
" labels_flat = labels.flatten()\n",
|
731 |
+
" return np.sum(pred_flat == labels_flat) / len(labels_flat)"
|
732 |
+
]
|
733 |
+
},
|
734 |
+
{
|
735 |
+
"cell_type": "code",
|
736 |
+
"execution_count": 24,
|
737 |
+
"metadata": {},
|
738 |
+
"outputs": [],
|
739 |
+
"source": [
|
740 |
+
"def format_time(elapsed):\n",
|
741 |
+
"\n",
|
742 |
+
" elapsed_rounded = int(round((elapsed)))\n",
|
743 |
+
" return str(datetime.timedelta(seconds=elapsed_rounded))\n"
|
744 |
+
]
|
745 |
+
},
|
746 |
+
{
|
747 |
+
"cell_type": "code",
|
748 |
+
"execution_count": 25,
|
749 |
+
"metadata": {},
|
750 |
+
"outputs": [],
|
751 |
+
"source": [
|
752 |
+
"class sinKafModel(pl.LightningModule):\n",
|
753 |
+
" def __init__(self, model, optimizer, scheduler):\n",
|
754 |
+
" super().__init__()\n",
|
755 |
+
" self.model = model\n",
|
756 |
+
" self.optimizer = optimizer\n",
|
757 |
+
" self.scheduler = scheduler\n",
|
758 |
+
"\n",
|
759 |
+
"\n",
|
760 |
+
" def forward(self, input_ids, attention_mask, labels):\n",
|
761 |
+
" outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)\n",
|
762 |
+
" return outputs\n",
|
763 |
+
"\n",
|
764 |
+
" def training_step(self, batch, batch_idx):\n",
|
765 |
+
" input_ids, input_mask, labels = batch\n",
|
766 |
+
" outputs = self(input_ids, input_mask, labels)\n",
|
767 |
+
" loss = outputs.loss\n",
|
768 |
+
" self.log('train_loss', loss)\n",
|
769 |
+
" return loss\n",
|
770 |
+
"\n",
|
771 |
+
" def validation_step(self, batch, batch_idx):\n",
|
772 |
+
" input_ids, input_mask, labels = batch\n",
|
773 |
+
" outputs = self(input_ids, input_mask, labels)\n",
|
774 |
+
" loss = outputs.loss\n",
|
775 |
+
" logits = outputs.logits\n",
|
776 |
+
" preds = torch.argmax(logits, dim=1)\n",
|
777 |
+
" acc = (preds == labels).sum().item() / len(labels)\n",
|
778 |
+
" self.log('val_loss', loss)\n",
|
779 |
+
" self.log('val_acc', acc)\n",
|
780 |
+
" return loss\n",
|
781 |
+
"\n",
|
782 |
+
" def configure_optimizers(self):\n",
|
783 |
+
" return [self.optimizer], [self.scheduler]\n",
|
784 |
+
"\n",
|
785 |
+
" # def train_dataloader(self):\n",
|
786 |
+
" # return self.train_dataloader\n",
|
787 |
+
"\n",
|
788 |
+
" # def val_dataloader(self):\n",
|
789 |
+
" # return self.validation_dataloader\n"
|
790 |
+
]
|
791 |
+
},
|
792 |
+
{
|
793 |
+
"cell_type": "code",
|
794 |
+
"execution_count": 26,
|
795 |
+
"metadata": {},
|
796 |
+
"outputs": [],
|
797 |
+
"source": [
|
798 |
+
"train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = 2 )\n",
|
799 |
+
"validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = 2 )"
|
800 |
+
]
|
801 |
+
},
|
802 |
+
{
|
803 |
+
"cell_type": "code",
|
804 |
+
"execution_count": 27,
|
805 |
+
"metadata": {},
|
806 |
+
"outputs": [],
|
807 |
+
"source": [
|
808 |
+
"epochs = 4\n",
|
809 |
+
"total_steps = len(train_dataloader) * epochs\n",
|
810 |
+
"scheduler = get_linear_schedule_with_warmup(optimizer, \n",
|
811 |
+
" num_warmup_steps = 0, \n",
|
812 |
+
" num_training_steps = total_steps)"
|
813 |
+
]
|
814 |
+
},
|
815 |
+
{
|
816 |
+
"cell_type": "code",
|
817 |
+
"execution_count": 28,
|
818 |
+
"metadata": {},
|
819 |
+
"outputs": [
|
820 |
+
{
|
821 |
+
"name": "stderr",
|
822 |
+
"output_type": "stream",
|
823 |
+
"text": [
|
824 |
+
"GPU available: True (cuda), used: True\n",
|
825 |
+
"TPU available: False, using: 0 TPU cores\n",
|
826 |
+
"IPU available: False, using: 0 IPUs\n",
|
827 |
+
"HPU available: False, using: 0 HPUs\n",
|
828 |
+
"/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: UserWarning: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
|
829 |
+
" warning_cache.warn(\n"
|
830 |
+
]
|
831 |
+
},
|
832 |
+
{
|
833 |
+
"name": "stderr",
|
834 |
+
"output_type": "stream",
|
835 |
+
"text": [
|
836 |
+
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
|
837 |
+
"\n",
|
838 |
+
" | Name | Type | Params\n",
|
839 |
+
"--------------------------------------------------------\n",
|
840 |
+
"0 | model | BertForSequenceClassification | 177 M \n",
|
841 |
+
"--------------------------------------------------------\n",
|
842 |
+
"177 M Trainable params\n",
|
843 |
+
"0 Non-trainable params\n",
|
844 |
+
"177 M Total params\n",
|
845 |
+
"711.420 Total estimated model params size (MB)\n"
|
846 |
+
]
|
847 |
+
},
|
848 |
+
{
|
849 |
+
"name": "stdout",
|
850 |
+
"output_type": "stream",
|
851 |
+
"text": [
|
852 |
+
"Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]"
|
853 |
+
]
|
854 |
+
},
|
855 |
+
{
|
856 |
+
"name": "stderr",
|
857 |
+
"output_type": "stream",
|
858 |
+
"text": [
|
859 |
+
"/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:442: PossibleUserWarning: The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
|
860 |
+
" rank_zero_warn(\n"
|
861 |
+
]
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"name": "stdout",
|
865 |
+
"output_type": "stream",
|
866 |
+
"text": [
|
867 |
+
" "
|
868 |
+
]
|
869 |
+
},
|
870 |
+
{
|
871 |
+
"name": "stderr",
|
872 |
+
"output_type": "stream",
|
873 |
+
"text": [
|
874 |
+
"/home/sebit/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:442: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.\n",
|
875 |
+
" rank_zero_warn(\n"
|
876 |
+
]
|
877 |
+
},
|
878 |
+
{
|
879 |
+
"name": "stdout",
|
880 |
+
"output_type": "stream",
|
881 |
+
"text": [
|
882 |
+
"Epoch 0: 0%| | 1/1249 [00:00<05:01, 4.13it/s, v_num=6]"
|
883 |
+
]
|
884 |
+
},
|
885 |
+
{
|
886 |
+
"ename": "OutOfMemoryError",
|
887 |
+
"evalue": "CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 4.00 GiB total capacity; 2.67 GiB already allocated; 0 bytes free; 2.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
|
888 |
+
"output_type": "error",
|
889 |
+
"traceback": [
|
890 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
891 |
+
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
|
892 |
+
"Cell \u001b[0;32mIn[28], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[39m=\u001b[39m sinKafModel(model, optimizer, scheduler)\n\u001b[1;32m 2\u001b[0m trainer \u001b[39m=\u001b[39m pl\u001b[39m.\u001b[39mTrainer( max_epochs\u001b[39m=\u001b[39mepochs, limit_train_batches\u001b[39m=\u001b[39m\u001b[39m0.1\u001b[39m, devices\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m, accelerator\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mgpu\u001b[39m\u001b[39m'\u001b[39m) \n\u001b[0;32m----> 3\u001b[0m trainer\u001b[39m.\u001b[39;49mfit(model,train_dataloader,validation_dataloader )\n",
|
893 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:532\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39m_lightning_module \u001b[39m=\u001b[39m model\n\u001b[1;32m 531\u001b[0m _verify_strategy_supports_compile(model, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstrategy)\n\u001b[0;32m--> 532\u001b[0m call\u001b[39m.\u001b[39;49m_call_and_handle_interrupt(\n\u001b[1;32m 533\u001b[0m \u001b[39mself\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path\n\u001b[1;32m 534\u001b[0m )\n",
|
894 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:43\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m trainer\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39mlauncher \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 42\u001b[0m \u001b[39mreturn\u001b[39;00m trainer\u001b[39m.\u001b[39mstrategy\u001b[39m.\u001b[39mlauncher\u001b[39m.\u001b[39mlaunch(trainer_fn, \u001b[39m*\u001b[39margs, trainer\u001b[39m=\u001b[39mtrainer, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m---> 43\u001b[0m \u001b[39mreturn\u001b[39;00m trainer_fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 45\u001b[0m \u001b[39mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 46\u001b[0m _call_teardown_hook(trainer)\n",
|
895 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:571\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data_connector\u001b[39m.\u001b[39mattach_data(\n\u001b[1;32m 562\u001b[0m model, train_dataloaders\u001b[39m=\u001b[39mtrain_dataloaders, val_dataloaders\u001b[39m=\u001b[39mval_dataloaders, datamodule\u001b[39m=\u001b[39mdatamodule\n\u001b[1;32m 563\u001b[0m )\n\u001b[1;32m 565\u001b[0m ckpt_path \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_checkpoint_connector\u001b[39m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 566\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mfn,\n\u001b[1;32m 567\u001b[0m ckpt_path,\n\u001b[1;32m 568\u001b[0m model_provided\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 569\u001b[0m model_connected\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlightning_module \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 570\u001b[0m )\n\u001b[0;32m--> 571\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run(model, ckpt_path\u001b[39m=\u001b[39;49mckpt_path)\n\u001b[1;32m 573\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mstopped\n\u001b[1;32m 574\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
|
896 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:980\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_signal_connector\u001b[39m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 977\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 978\u001b[0m \u001b[39m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 979\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 980\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run_stage()\n\u001b[1;32m 982\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 983\u001b[0m \u001b[39m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 984\u001b[0m \u001b[39m# ----------------------------\u001b[39;00m\n\u001b[1;32m 985\u001b[0m log\u001b[39m.\u001b[39mdebug(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m: trainer tearing down\u001b[39m\u001b[39m\"\u001b[39m)\n",
|
897 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1023\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_run_sanity_check()\n\u001b[1;32m 1022\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mautograd\u001b[39m.\u001b[39mset_detect_anomaly(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_detect_anomaly):\n\u001b[0;32m-> 1023\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfit_loop\u001b[39m.\u001b[39;49mrun()\n\u001b[1;32m 1024\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 1025\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUnexpected state \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n",
|
898 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:202\u001b[0m, in \u001b[0;36m_FitLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 201\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_start()\n\u001b[0;32m--> 202\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49madvance()\n\u001b[1;32m 203\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_end()\n\u001b[1;32m 204\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_restarting \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
|
899 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:355\u001b[0m, in \u001b[0;36m_FitLoop.advance\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data_fetcher\u001b[39m.\u001b[39msetup(combined_loader)\n\u001b[1;32m 354\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtrainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39m\"\u001b[39m\u001b[39mrun_training_epoch\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 355\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mepoch_loop\u001b[39m.\u001b[39;49mrun(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_data_fetcher)\n",
|
900 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py:133\u001b[0m, in \u001b[0;36m_TrainingEpochLoop.run\u001b[0;34m(self, data_fetcher)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdone:\n\u001b[1;32m 132\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 133\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49madvance(data_fetcher)\n\u001b[1;32m 134\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mon_advance_end()\n\u001b[1;32m 135\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_restarting \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n",
|
901 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py:219\u001b[0m, in \u001b[0;36m_TrainingEpochLoop.advance\u001b[0;34m(self, data_fetcher)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[39mwith\u001b[39;00m trainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39m\"\u001b[39m\u001b[39mrun_training_batch\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 217\u001b[0m \u001b[39mif\u001b[39;00m trainer\u001b[39m.\u001b[39mlightning_module\u001b[39m.\u001b[39mautomatic_optimization:\n\u001b[1;32m 218\u001b[0m \u001b[39m# in automatic optimization, there can only be one optimizer\u001b[39;00m\n\u001b[0;32m--> 219\u001b[0m batch_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mautomatic_optimization\u001b[39m.\u001b[39;49mrun(trainer\u001b[39m.\u001b[39;49moptimizers[\u001b[39m0\u001b[39;49m], kwargs)\n\u001b[1;32m 220\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 221\u001b[0m batch_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmanual_optimization\u001b[39m.\u001b[39mrun(kwargs)\n",
|
902 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py:188\u001b[0m, in \u001b[0;36m_AutomaticOptimization.run\u001b[0;34m(self, optimizer, kwargs)\u001b[0m\n\u001b[1;32m 181\u001b[0m closure()\n\u001b[1;32m 183\u001b[0m \u001b[39m# ------------------------------\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[39m# BACKWARD PASS\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[39m# ------------------------------\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \u001b[39m# gradient update with accumulated gradients\u001b[39;00m\n\u001b[1;32m 187\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 188\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_optimizer_step(kwargs\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mbatch_idx\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m0\u001b[39;49m), closure)\n\u001b[1;32m 190\u001b[0m result \u001b[39m=\u001b[39m closure\u001b[39m.\u001b[39mconsume_result()\n\u001b[1;32m 191\u001b[0m \u001b[39mif\u001b[39;00m result\u001b[39m.\u001b[39mloss \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
|
903 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py:266\u001b[0m, in \u001b[0;36m_AutomaticOptimization._optimizer_step\u001b[0;34m(self, batch_idx, train_step_and_backward_closure)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptim_progress\u001b[39m.\u001b[39moptimizer\u001b[39m.\u001b[39mstep\u001b[39m.\u001b[39mincrement_ready()\n\u001b[1;32m 265\u001b[0m \u001b[39m# model hook\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m call\u001b[39m.\u001b[39;49m_call_lightning_module_hook(\n\u001b[1;32m 267\u001b[0m trainer,\n\u001b[1;32m 268\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39moptimizer_step\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 269\u001b[0m trainer\u001b[39m.\u001b[39;49mcurrent_epoch,\n\u001b[1;32m 270\u001b[0m batch_idx,\n\u001b[1;32m 271\u001b[0m optimizer,\n\u001b[1;32m 272\u001b[0m train_step_and_backward_closure,\n\u001b[1;32m 273\u001b[0m )\n\u001b[1;32m 275\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m should_accumulate:\n\u001b[1;32m 276\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptim_progress\u001b[39m.\u001b[39moptimizer\u001b[39m.\u001b[39mstep\u001b[39m.\u001b[39mincrement_completed()\n",
|
904 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:146\u001b[0m, in \u001b[0;36m_call_lightning_module_hook\u001b[0;34m(trainer, hook_name, pl_module, *args, **kwargs)\u001b[0m\n\u001b[1;32m 143\u001b[0m pl_module\u001b[39m.\u001b[39m_current_fx_name \u001b[39m=\u001b[39m hook_name\n\u001b[1;32m 145\u001b[0m \u001b[39mwith\u001b[39;00m trainer\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mprofile(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m[LightningModule]\u001b[39m\u001b[39m{\u001b[39;00mpl_module\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m{\u001b[39;00mhook_name\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 146\u001b[0m output \u001b[39m=\u001b[39m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 148\u001b[0m \u001b[39m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m 149\u001b[0m pl_module\u001b[39m.\u001b[39m_current_fx_name \u001b[39m=\u001b[39m prev_fx_name\n",
|
905 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/core/module.py:1270\u001b[0m, in \u001b[0;36mLightningModule.optimizer_step\u001b[0;34m(self, epoch, batch_idx, optimizer, optimizer_closure)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39moptimizer_step\u001b[39m(\n\u001b[1;32m 1233\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 1234\u001b[0m epoch: \u001b[39mint\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1237\u001b[0m optimizer_closure: Optional[Callable[[], Any]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 1238\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 1239\u001b[0m \u001b[39m \u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\"\"Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer`\u001b[39;00m\n\u001b[1;32m 1240\u001b[0m \u001b[39m calls the optimizer.\u001b[39;00m\n\u001b[1;32m 1241\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1268\u001b[0m \u001b[39m pg[\"lr\"] = lr_scale * self.learning_rate\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1270\u001b[0m optimizer\u001b[39m.\u001b[39;49mstep(closure\u001b[39m=\u001b[39;49moptimizer_closure)\n",
|
906 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py:161\u001b[0m, in \u001b[0;36mLightningOptimizer.step\u001b[0;34m(self, closure, **kwargs)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[39mraise\u001b[39;00m MisconfigurationException(\u001b[39m\"\u001b[39m\u001b[39mWhen `optimizer.step(closure)` is called, the closure should be callable\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 160\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_strategy \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m--> 161\u001b[0m step_output \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_strategy\u001b[39m.\u001b[39;49moptimizer_step(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_optimizer, closure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 163\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_on_after_step()\n\u001b[1;32m 165\u001b[0m \u001b[39mreturn\u001b[39;00m step_output\n",
|
907 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:231\u001b[0m, in \u001b[0;36mStrategy.optimizer_step\u001b[0;34m(self, optimizer, closure, model, **kwargs)\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[39m# TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed\u001b[39;00m\n\u001b[1;32m 230\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(model, pl\u001b[39m.\u001b[39mLightningModule)\n\u001b[0;32m--> 231\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprecision_plugin\u001b[39m.\u001b[39;49moptimizer_step(optimizer, model\u001b[39m=\u001b[39;49mmodel, closure\u001b[39m=\u001b[39;49mclosure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
908 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:116\u001b[0m, in \u001b[0;36mPrecisionPlugin.optimizer_step\u001b[0;34m(self, optimizer, model, closure, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Hook to run the optimizer step.\"\"\"\u001b[39;00m\n\u001b[1;32m 115\u001b[0m closure \u001b[39m=\u001b[39m partial(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_wrap_closure, model, optimizer, closure)\n\u001b[0;32m--> 116\u001b[0m \u001b[39mreturn\u001b[39;00m optimizer\u001b[39m.\u001b[39;49mstep(closure\u001b[39m=\u001b[39;49mclosure, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
909 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/optim/lr_scheduler.py:69\u001b[0m, in \u001b[0;36mLRScheduler.__init__.<locals>.with_counter.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m instance\u001b[39m.\u001b[39m_step_count \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m 68\u001b[0m wrapped \u001b[39m=\u001b[39m func\u001b[39m.\u001b[39m\u001b[39m__get__\u001b[39m(instance, \u001b[39mcls\u001b[39m)\n\u001b[0;32m---> 69\u001b[0m \u001b[39mreturn\u001b[39;00m wrapped(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
910 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/optim/optimizer.py:280\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 277\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m}\u001b[39;00m\u001b[39m must return None or a tuple of (new_args, new_kwargs),\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 278\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mbut got \u001b[39m\u001b[39m{\u001b[39;00mresult\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 280\u001b[0m out \u001b[39m=\u001b[39m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 281\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m 283\u001b[0m \u001b[39m# call optimizer step post hooks\u001b[39;00m\n",
|
911 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecorate_context\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[39mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
912 |
+
"File \u001b[0;32m~/anaconda3/envs/testenv/lib/python3.9/site-packages/transformers/optimization.py:468\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 466\u001b[0m exp_avg\u001b[39m.\u001b[39mmul_(beta1)\u001b[39m.\u001b[39madd_(grad, alpha\u001b[39m=\u001b[39m(\u001b[39m1.0\u001b[39m \u001b[39m-\u001b[39m beta1))\n\u001b[1;32m 467\u001b[0m exp_avg_sq\u001b[39m.\u001b[39mmul_(beta2)\u001b[39m.\u001b[39maddcmul_(grad, grad, value\u001b[39m=\u001b[39m\u001b[39m1.0\u001b[39m \u001b[39m-\u001b[39m beta2)\n\u001b[0;32m--> 468\u001b[0m denom \u001b[39m=\u001b[39m exp_avg_sq\u001b[39m.\u001b[39;49msqrt()\u001b[39m.\u001b[39madd_(group[\u001b[39m\"\u001b[39m\u001b[39meps\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 470\u001b[0m step_size \u001b[39m=\u001b[39m group[\u001b[39m\"\u001b[39m\u001b[39mlr\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 471\u001b[0m \u001b[39mif\u001b[39;00m group[\u001b[39m\"\u001b[39m\u001b[39mcorrect_bias\u001b[39m\u001b[39m\"\u001b[39m]: \u001b[39m# No bias correction for Bert\u001b[39;00m\n",
|
913 |
+
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 4.00 GiB total capacity; 2.67 GiB already allocated; 0 bytes free; 2.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
|
914 |
+
]
|
915 |
+
}
|
916 |
+
],
|
917 |
+
"source": [
|
918 |
+
"model = sinKafModel(model, optimizer, scheduler)\n",
|
919 |
+
"trainer = pl.Trainer( max_epochs=epochs, limit_train_batches=0.1, devices=1, accelerator='gpu') \n",
|
920 |
+
"trainer.fit(model,train_dataloader,validation_dataloader )"
|
921 |
+
]
|
922 |
+
},
|
923 |
+
{
|
924 |
+
"cell_type": "code",
|
925 |
+
"execution_count": null,
|
926 |
+
"metadata": {},
|
927 |
+
"outputs": [],
|
928 |
+
"source": [
|
929 |
+
"sent = 'Koyunlar hasta'"
|
930 |
+
]
|
931 |
+
},
|
932 |
+
{
|
933 |
+
"cell_type": "code",
|
934 |
+
"execution_count": null,
|
935 |
+
"metadata": {},
|
936 |
+
"outputs": [],
|
937 |
+
"source": [
|
938 |
+
"input_ids = []\n",
|
939 |
+
"attention_masks = []\n",
|
940 |
+
"\n",
|
941 |
+
"encoded_dict = tokenizer.encode_plus(\n",
|
942 |
+
" sent,\n",
|
943 |
+
" add_special_tokens = True,\n",
|
944 |
+
" max_length = 64,\n",
|
945 |
+
" pad_to_max_length = True,\n",
|
946 |
+
" return_attention_mask = True,\n",
|
947 |
+
" return_tensors = 'pt',\n",
|
948 |
+
" )\n",
|
949 |
+
"\n",
|
950 |
+
"\n",
|
951 |
+
"input_ids = encoded_dict['input_ids']\n",
|
952 |
+
"attention_masks = encoded_dict['attention_mask']\n",
|
953 |
+
"\n",
|
954 |
+
"\n",
|
955 |
+
"input_ids = torch.cat([input_ids], dim=0)\n",
|
956 |
+
"input_mask = torch.cat([attention_masks], dim=0)\n",
|
957 |
+
"labels = torch.tensor(labels)\n",
|
958 |
+
"\n",
|
959 |
+
"\n",
|
960 |
+
"\n",
|
961 |
+
"\n",
|
962 |
+
"print('Original: ', sent)\n",
|
963 |
+
"print('Token IDs:', input_ids)\n",
|
964 |
+
"print('Token IDs:', input_mask)"
|
965 |
+
]
|
966 |
+
},
|
967 |
+
{
|
968 |
+
"cell_type": "code",
|
969 |
+
"execution_count": null,
|
970 |
+
"metadata": {},
|
971 |
+
"outputs": [],
|
972 |
+
"source": [
|
973 |
+
"outputs = model(input_ids, input_mask, labels[0])"
|
974 |
+
]
|
975 |
+
},
|
976 |
+
{
|
977 |
+
"cell_type": "code",
|
978 |
+
"execution_count": null,
|
979 |
+
"metadata": {},
|
980 |
+
"outputs": [],
|
981 |
+
"source": [
|
982 |
+
"outputs[0]"
|
983 |
+
]
|
984 |
+
}
|
985 |
+
],
|
986 |
+
"metadata": {
|
987 |
+
"kernelspec": {
|
988 |
+
"display_name": "sbtenv",
|
989 |
+
"language": "python",
|
990 |
+
"name": "python3"
|
991 |
+
},
|
992 |
+
"language_info": {
|
993 |
+
"codemirror_mode": {
|
994 |
+
"name": "ipython",
|
995 |
+
"version": 3
|
996 |
+
},
|
997 |
+
"file_extension": ".py",
|
998 |
+
"mimetype": "text/x-python",
|
999 |
+
"name": "python",
|
1000 |
+
"nbconvert_exporter": "python",
|
1001 |
+
"pygments_lexer": "ipython3",
|
1002 |
+
"version": "3.9.0"
|
1003 |
+
},
|
1004 |
+
"orig_nbformat": 4
|
1005 |
+
},
|
1006 |
+
"nbformat": 4,
|
1007 |
+
"nbformat_minor": 2
|
1008 |
+
}
|
onnx_model/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "/DATA/sin-kaf/test_trainer/checkpoint-18500/config.json",
|
3 |
+
"activation": "gelu",
|
4 |
+
"architectures": [
|
5 |
+
"DistilBertForSequenceClassification"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.1,
|
8 |
+
"attention_probs_dropout_prob": 0.1,
|
9 |
+
"dim": 768,
|
10 |
+
"dropout": 0.1,
|
11 |
+
"hidden_dim": 3072,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"max_position_embeddings": 512,
|
14 |
+
"model_type": "distilbert",
|
15 |
+
"n_heads": 12,
|
16 |
+
"n_layers": 6,
|
17 |
+
"output_past": true,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"problem_type": "single_label_classification",
|
20 |
+
"qa_dropout": 0.1,
|
21 |
+
"seq_classif_dropout": 0.2,
|
22 |
+
"sinusoidal_pos_embds": true,
|
23 |
+
"tie_weights_": true,
|
24 |
+
"torch_dtype": "float32",
|
25 |
+
"transformers_version": "4.34.1",
|
26 |
+
"vocab_size": 32000
|
27 |
+
}
|
onnx_model/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cee6ddb2c1e1abb21e513881265239a57dd3cba52f621b6c81a78e41e66eae09
|
3 |
+
size 272496128
|
outlier_detection.ipynb
ADDED
@@ -0,0 +1,2292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import re\n",
|
10 |
+
"import tqdm\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"import pandas as pd\n",
|
13 |
+
"import matplotlib.pyplot as plt\n",
|
14 |
+
"import seaborn as sns\n",
|
15 |
+
"\n",
|
16 |
+
"from sklearn.datasets import fetch_20newsgroups\n",
|
17 |
+
"from sklearn.manifold import TSNE\n"
|
18 |
+
]
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"cell_type": "code",
|
22 |
+
"execution_count": 2,
|
23 |
+
"metadata": {},
|
24 |
+
"outputs": [],
|
25 |
+
"source": [
|
26 |
+
"embeding_df=pd.read_csv('/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv')\n",
|
27 |
+
"embeding_df=embeding_df.drop(['Unnamed: 0'], axis=1)\n",
|
28 |
+
"copy_df=pd.read_csv('/mnt/c/Users/selin_uzturk/Desktop/sinkaf/encoded.csv')\n",
|
29 |
+
"copy_df=copy_df.drop(['Unnamed: 0'], axis=1)\n"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 3,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"text/html": [
|
40 |
+
"<div>\n",
|
41 |
+
"<style scoped>\n",
|
42 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
43 |
+
" vertical-align: middle;\n",
|
44 |
+
" }\n",
|
45 |
+
"\n",
|
46 |
+
" .dataframe tbody tr th {\n",
|
47 |
+
" vertical-align: top;\n",
|
48 |
+
" }\n",
|
49 |
+
"\n",
|
50 |
+
" .dataframe thead th {\n",
|
51 |
+
" text-align: right;\n",
|
52 |
+
" }\n",
|
53 |
+
"</style>\n",
|
54 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
55 |
+
" <thead>\n",
|
56 |
+
" <tr style=\"text-align: right;\">\n",
|
57 |
+
" <th></th>\n",
|
58 |
+
" <th>0</th>\n",
|
59 |
+
" <th>1</th>\n",
|
60 |
+
" <th>2</th>\n",
|
61 |
+
" <th>3</th>\n",
|
62 |
+
" <th>4</th>\n",
|
63 |
+
" <th>5</th>\n",
|
64 |
+
" <th>6</th>\n",
|
65 |
+
" <th>7</th>\n",
|
66 |
+
" <th>8</th>\n",
|
67 |
+
" <th>9</th>\n",
|
68 |
+
" <th>...</th>\n",
|
69 |
+
" <th>56</th>\n",
|
70 |
+
" <th>57</th>\n",
|
71 |
+
" <th>58</th>\n",
|
72 |
+
" <th>59</th>\n",
|
73 |
+
" <th>60</th>\n",
|
74 |
+
" <th>61</th>\n",
|
75 |
+
" <th>62</th>\n",
|
76 |
+
" <th>63</th>\n",
|
77 |
+
" <th>labels</th>\n",
|
78 |
+
" <th>tweet</th>\n",
|
79 |
+
" </tr>\n",
|
80 |
+
" </thead>\n",
|
81 |
+
" <tbody>\n",
|
82 |
+
" <tr>\n",
|
83 |
+
" <th>0</th>\n",
|
84 |
+
" <td>101</td>\n",
|
85 |
+
" <td>10110</td>\n",
|
86 |
+
" <td>175</td>\n",
|
87 |
+
" <td>78653</td>\n",
|
88 |
+
" <td>189</td>\n",
|
89 |
+
" <td>25285</td>\n",
|
90 |
+
" <td>15976</td>\n",
|
91 |
+
" <td>40840</td>\n",
|
92 |
+
" <td>276</td>\n",
|
93 |
+
" <td>31623</td>\n",
|
94 |
+
" <td>...</td>\n",
|
95 |
+
" <td>0</td>\n",
|
96 |
+
" <td>0</td>\n",
|
97 |
+
" <td>0</td>\n",
|
98 |
+
" <td>0</td>\n",
|
99 |
+
" <td>0</td>\n",
|
100 |
+
" <td>0</td>\n",
|
101 |
+
" <td>0</td>\n",
|
102 |
+
" <td>0</td>\n",
|
103 |
+
" <td>0</td>\n",
|
104 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
105 |
+
" </tr>\n",
|
106 |
+
" <tr>\n",
|
107 |
+
" <th>1</th>\n",
|
108 |
+
" <td>101</td>\n",
|
109 |
+
" <td>11589</td>\n",
|
110 |
+
" <td>10706</td>\n",
|
111 |
+
" <td>10713</td>\n",
|
112 |
+
" <td>10794</td>\n",
|
113 |
+
" <td>94698</td>\n",
|
114 |
+
" <td>30668</td>\n",
|
115 |
+
" <td>24883</td>\n",
|
116 |
+
" <td>117</td>\n",
|
117 |
+
" <td>23763</td>\n",
|
118 |
+
" <td>...</td>\n",
|
119 |
+
" <td>0</td>\n",
|
120 |
+
" <td>0</td>\n",
|
121 |
+
" <td>0</td>\n",
|
122 |
+
" <td>0</td>\n",
|
123 |
+
" <td>0</td>\n",
|
124 |
+
" <td>0</td>\n",
|
125 |
+
" <td>0</td>\n",
|
126 |
+
" <td>0</td>\n",
|
127 |
+
" <td>0</td>\n",
|
128 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
129 |
+
" </tr>\n",
|
130 |
+
" <tr>\n",
|
131 |
+
" <th>2</th>\n",
|
132 |
+
" <td>101</td>\n",
|
133 |
+
" <td>148</td>\n",
|
134 |
+
" <td>30471</td>\n",
|
135 |
+
" <td>10774</td>\n",
|
136 |
+
" <td>13785</td>\n",
|
137 |
+
" <td>13779</td>\n",
|
138 |
+
" <td>33642</td>\n",
|
139 |
+
" <td>14399</td>\n",
|
140 |
+
" <td>48271</td>\n",
|
141 |
+
" <td>76686</td>\n",
|
142 |
+
" <td>...</td>\n",
|
143 |
+
" <td>0</td>\n",
|
144 |
+
" <td>0</td>\n",
|
145 |
+
" <td>0</td>\n",
|
146 |
+
" <td>0</td>\n",
|
147 |
+
" <td>0</td>\n",
|
148 |
+
" <td>0</td>\n",
|
149 |
+
" <td>0</td>\n",
|
150 |
+
" <td>0</td>\n",
|
151 |
+
" <td>0</td>\n",
|
152 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
153 |
+
" </tr>\n",
|
154 |
+
" <tr>\n",
|
155 |
+
" <th>3</th>\n",
|
156 |
+
" <td>101</td>\n",
|
157 |
+
" <td>19319</td>\n",
|
158 |
+
" <td>16724</td>\n",
|
159 |
+
" <td>10118</td>\n",
|
160 |
+
" <td>10107</td>\n",
|
161 |
+
" <td>78323</td>\n",
|
162 |
+
" <td>12407</td>\n",
|
163 |
+
" <td>38959</td>\n",
|
164 |
+
" <td>22934</td>\n",
|
165 |
+
" <td>10147</td>\n",
|
166 |
+
" <td>...</td>\n",
|
167 |
+
" <td>0</td>\n",
|
168 |
+
" <td>0</td>\n",
|
169 |
+
" <td>0</td>\n",
|
170 |
+
" <td>0</td>\n",
|
171 |
+
" <td>0</td>\n",
|
172 |
+
" <td>0</td>\n",
|
173 |
+
" <td>0</td>\n",
|
174 |
+
" <td>0</td>\n",
|
175 |
+
" <td>0</td>\n",
|
176 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
177 |
+
" </tr>\n",
|
178 |
+
" <tr>\n",
|
179 |
+
" <th>4</th>\n",
|
180 |
+
" <td>101</td>\n",
|
181 |
+
" <td>30932</td>\n",
|
182 |
+
" <td>58706</td>\n",
|
183 |
+
" <td>58054</td>\n",
|
184 |
+
" <td>44907</td>\n",
|
185 |
+
" <td>10224</td>\n",
|
186 |
+
" <td>106583</td>\n",
|
187 |
+
" <td>10288</td>\n",
|
188 |
+
" <td>12524</td>\n",
|
189 |
+
" <td>13878</td>\n",
|
190 |
+
" <td>...</td>\n",
|
191 |
+
" <td>0</td>\n",
|
192 |
+
" <td>0</td>\n",
|
193 |
+
" <td>0</td>\n",
|
194 |
+
" <td>0</td>\n",
|
195 |
+
" <td>0</td>\n",
|
196 |
+
" <td>0</td>\n",
|
197 |
+
" <td>0</td>\n",
|
198 |
+
" <td>0</td>\n",
|
199 |
+
" <td>0</td>\n",
|
200 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
201 |
+
" </tr>\n",
|
202 |
+
" <tr>\n",
|
203 |
+
" <th>...</th>\n",
|
204 |
+
" <td>...</td>\n",
|
205 |
+
" <td>...</td>\n",
|
206 |
+
" <td>...</td>\n",
|
207 |
+
" <td>...</td>\n",
|
208 |
+
" <td>...</td>\n",
|
209 |
+
" <td>...</td>\n",
|
210 |
+
" <td>...</td>\n",
|
211 |
+
" <td>...</td>\n",
|
212 |
+
" <td>...</td>\n",
|
213 |
+
" <td>...</td>\n",
|
214 |
+
" <td>...</td>\n",
|
215 |
+
" <td>...</td>\n",
|
216 |
+
" <td>...</td>\n",
|
217 |
+
" <td>...</td>\n",
|
218 |
+
" <td>...</td>\n",
|
219 |
+
" <td>...</td>\n",
|
220 |
+
" <td>...</td>\n",
|
221 |
+
" <td>...</td>\n",
|
222 |
+
" <td>...</td>\n",
|
223 |
+
" <td>...</td>\n",
|
224 |
+
" <td>...</td>\n",
|
225 |
+
" </tr>\n",
|
226 |
+
" <tr>\n",
|
227 |
+
" <th>43344</th>\n",
|
228 |
+
" <td>101</td>\n",
|
229 |
+
" <td>20065</td>\n",
|
230 |
+
" <td>10161</td>\n",
|
231 |
+
" <td>115</td>\n",
|
232 |
+
" <td>115</td>\n",
|
233 |
+
" <td>103784</td>\n",
|
234 |
+
" <td>10774</td>\n",
|
235 |
+
" <td>21388</td>\n",
|
236 |
+
" <td>10245</td>\n",
|
237 |
+
" <td>92067</td>\n",
|
238 |
+
" <td>...</td>\n",
|
239 |
+
" <td>0</td>\n",
|
240 |
+
" <td>0</td>\n",
|
241 |
+
" <td>0</td>\n",
|
242 |
+
" <td>0</td>\n",
|
243 |
+
" <td>0</td>\n",
|
244 |
+
" <td>0</td>\n",
|
245 |
+
" <td>0</td>\n",
|
246 |
+
" <td>0</td>\n",
|
247 |
+
" <td>1</td>\n",
|
248 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
249 |
+
" </tr>\n",
|
250 |
+
" <tr>\n",
|
251 |
+
" <th>43345</th>\n",
|
252 |
+
" <td>101</td>\n",
|
253 |
+
" <td>139</td>\n",
|
254 |
+
" <td>80839</td>\n",
|
255 |
+
" <td>24109</td>\n",
|
256 |
+
" <td>13406</td>\n",
|
257 |
+
" <td>18985</td>\n",
|
258 |
+
" <td>16285</td>\n",
|
259 |
+
" <td>10163</td>\n",
|
260 |
+
" <td>11062</td>\n",
|
261 |
+
" <td>276</td>\n",
|
262 |
+
" <td>...</td>\n",
|
263 |
+
" <td>0</td>\n",
|
264 |
+
" <td>0</td>\n",
|
265 |
+
" <td>0</td>\n",
|
266 |
+
" <td>0</td>\n",
|
267 |
+
" <td>0</td>\n",
|
268 |
+
" <td>0</td>\n",
|
269 |
+
" <td>0</td>\n",
|
270 |
+
" <td>0</td>\n",
|
271 |
+
" <td>1</td>\n",
|
272 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
273 |
+
" </tr>\n",
|
274 |
+
" <tr>\n",
|
275 |
+
" <th>43346</th>\n",
|
276 |
+
" <td>101</td>\n",
|
277 |
+
" <td>105549</td>\n",
|
278 |
+
" <td>102635</td>\n",
|
279 |
+
" <td>10140</td>\n",
|
280 |
+
" <td>26943</td>\n",
|
281 |
+
" <td>11499</td>\n",
|
282 |
+
" <td>110516</td>\n",
|
283 |
+
" <td>21899</td>\n",
|
284 |
+
" <td>11861</td>\n",
|
285 |
+
" <td>10561</td>\n",
|
286 |
+
" <td>...</td>\n",
|
287 |
+
" <td>0</td>\n",
|
288 |
+
" <td>0</td>\n",
|
289 |
+
" <td>0</td>\n",
|
290 |
+
" <td>0</td>\n",
|
291 |
+
" <td>0</td>\n",
|
292 |
+
" <td>0</td>\n",
|
293 |
+
" <td>0</td>\n",
|
294 |
+
" <td>0</td>\n",
|
295 |
+
" <td>1</td>\n",
|
296 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
297 |
+
" </tr>\n",
|
298 |
+
" <tr>\n",
|
299 |
+
" <th>43347</th>\n",
|
300 |
+
" <td>101</td>\n",
|
301 |
+
" <td>81424</td>\n",
|
302 |
+
" <td>26398</td>\n",
|
303 |
+
" <td>92017</td>\n",
|
304 |
+
" <td>109620</td>\n",
|
305 |
+
" <td>10941</td>\n",
|
306 |
+
" <td>76010</td>\n",
|
307 |
+
" <td>10115</td>\n",
|
308 |
+
" <td>19830</td>\n",
|
309 |
+
" <td>26083</td>\n",
|
310 |
+
" <td>...</td>\n",
|
311 |
+
" <td>0</td>\n",
|
312 |
+
" <td>0</td>\n",
|
313 |
+
" <td>0</td>\n",
|
314 |
+
" <td>0</td>\n",
|
315 |
+
" <td>0</td>\n",
|
316 |
+
" <td>0</td>\n",
|
317 |
+
" <td>0</td>\n",
|
318 |
+
" <td>0</td>\n",
|
319 |
+
" <td>1</td>\n",
|
320 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
321 |
+
" </tr>\n",
|
322 |
+
" <tr>\n",
|
323 |
+
" <th>43348</th>\n",
|
324 |
+
" <td>101</td>\n",
|
325 |
+
" <td>39774</td>\n",
|
326 |
+
" <td>11127</td>\n",
|
327 |
+
" <td>45989</td>\n",
|
328 |
+
" <td>24596</td>\n",
|
329 |
+
" <td>11933</td>\n",
|
330 |
+
" <td>170</td>\n",
|
331 |
+
" <td>17145</td>\n",
|
332 |
+
" <td>10710</td>\n",
|
333 |
+
" <td>39125</td>\n",
|
334 |
+
" <td>...</td>\n",
|
335 |
+
" <td>0</td>\n",
|
336 |
+
" <td>0</td>\n",
|
337 |
+
" <td>0</td>\n",
|
338 |
+
" <td>0</td>\n",
|
339 |
+
" <td>0</td>\n",
|
340 |
+
" <td>0</td>\n",
|
341 |
+
" <td>0</td>\n",
|
342 |
+
" <td>0</td>\n",
|
343 |
+
" <td>1</td>\n",
|
344 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
345 |
+
" </tr>\n",
|
346 |
+
" </tbody>\n",
|
347 |
+
"</table>\n",
|
348 |
+
"<p>43349 rows × 66 columns</p>\n",
|
349 |
+
"</div>"
|
350 |
+
],
|
351 |
+
"text/plain": [
|
352 |
+
" 0 1 2 3 4 5 6 7 8 \n",
|
353 |
+
"0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
|
354 |
+
"1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
|
355 |
+
"2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
|
356 |
+
"3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
|
357 |
+
"4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
|
358 |
+
"... ... ... ... ... ... ... ... ... ... \n",
|
359 |
+
"43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
|
360 |
+
"43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
|
361 |
+
"43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
|
362 |
+
"43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
|
363 |
+
"43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
|
364 |
+
"\n",
|
365 |
+
" 9 ... 56 57 58 59 60 61 62 63 labels \n",
|
366 |
+
"0 31623 ... 0 0 0 0 0 0 0 0 0 \\\n",
|
367 |
+
"1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
|
368 |
+
"2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
|
369 |
+
"3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
|
370 |
+
"4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
|
371 |
+
"... ... ... .. .. .. .. .. .. .. .. ... \n",
|
372 |
+
"43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
|
373 |
+
"43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
|
374 |
+
"43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
|
375 |
+
"43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
|
376 |
+
"43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
|
377 |
+
"\n",
|
378 |
+
" tweet \n",
|
379 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
|
380 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
|
381 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
|
382 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
|
383 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
|
384 |
+
"... ... \n",
|
385 |
+
"43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
|
386 |
+
"43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
|
387 |
+
"43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
|
388 |
+
"43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
|
389 |
+
"43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
|
390 |
+
"\n",
|
391 |
+
"[43349 rows x 66 columns]"
|
392 |
+
]
|
393 |
+
},
|
394 |
+
"execution_count": 3,
|
395 |
+
"metadata": {},
|
396 |
+
"output_type": "execute_result"
|
397 |
+
}
|
398 |
+
],
|
399 |
+
"source": [
|
400 |
+
"copy_df"
|
401 |
+
]
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"cell_type": "code",
|
405 |
+
"execution_count": 4,
|
406 |
+
"metadata": {},
|
407 |
+
"outputs": [
|
408 |
+
{
|
409 |
+
"data": {
|
410 |
+
"text/html": [
|
411 |
+
"<div>\n",
|
412 |
+
"<style scoped>\n",
|
413 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
414 |
+
" vertical-align: middle;\n",
|
415 |
+
" }\n",
|
416 |
+
"\n",
|
417 |
+
" .dataframe tbody tr th {\n",
|
418 |
+
" vertical-align: top;\n",
|
419 |
+
" }\n",
|
420 |
+
"\n",
|
421 |
+
" .dataframe thead th {\n",
|
422 |
+
" text-align: right;\n",
|
423 |
+
" }\n",
|
424 |
+
"</style>\n",
|
425 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
426 |
+
" <thead>\n",
|
427 |
+
" <tr style=\"text-align: right;\">\n",
|
428 |
+
" <th></th>\n",
|
429 |
+
" <th>0</th>\n",
|
430 |
+
" <th>1</th>\n",
|
431 |
+
" <th>2</th>\n",
|
432 |
+
" <th>3</th>\n",
|
433 |
+
" <th>4</th>\n",
|
434 |
+
" <th>5</th>\n",
|
435 |
+
" <th>6</th>\n",
|
436 |
+
" <th>7</th>\n",
|
437 |
+
" <th>8</th>\n",
|
438 |
+
" <th>9</th>\n",
|
439 |
+
" <th>...</th>\n",
|
440 |
+
" <th>56</th>\n",
|
441 |
+
" <th>57</th>\n",
|
442 |
+
" <th>58</th>\n",
|
443 |
+
" <th>59</th>\n",
|
444 |
+
" <th>60</th>\n",
|
445 |
+
" <th>61</th>\n",
|
446 |
+
" <th>62</th>\n",
|
447 |
+
" <th>63</th>\n",
|
448 |
+
" <th>labels</th>\n",
|
449 |
+
" <th>tweet</th>\n",
|
450 |
+
" </tr>\n",
|
451 |
+
" </thead>\n",
|
452 |
+
" <tbody>\n",
|
453 |
+
" <tr>\n",
|
454 |
+
" <th>0</th>\n",
|
455 |
+
" <td>101</td>\n",
|
456 |
+
" <td>10110</td>\n",
|
457 |
+
" <td>175</td>\n",
|
458 |
+
" <td>78653</td>\n",
|
459 |
+
" <td>189</td>\n",
|
460 |
+
" <td>25285</td>\n",
|
461 |
+
" <td>15976</td>\n",
|
462 |
+
" <td>40840</td>\n",
|
463 |
+
" <td>276</td>\n",
|
464 |
+
" <td>31623</td>\n",
|
465 |
+
" <td>...</td>\n",
|
466 |
+
" <td>0</td>\n",
|
467 |
+
" <td>0</td>\n",
|
468 |
+
" <td>0</td>\n",
|
469 |
+
" <td>0</td>\n",
|
470 |
+
" <td>0</td>\n",
|
471 |
+
" <td>0</td>\n",
|
472 |
+
" <td>0</td>\n",
|
473 |
+
" <td>0</td>\n",
|
474 |
+
" <td>0</td>\n",
|
475 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
476 |
+
" </tr>\n",
|
477 |
+
" <tr>\n",
|
478 |
+
" <th>1</th>\n",
|
479 |
+
" <td>101</td>\n",
|
480 |
+
" <td>11589</td>\n",
|
481 |
+
" <td>10706</td>\n",
|
482 |
+
" <td>10713</td>\n",
|
483 |
+
" <td>10794</td>\n",
|
484 |
+
" <td>94698</td>\n",
|
485 |
+
" <td>30668</td>\n",
|
486 |
+
" <td>24883</td>\n",
|
487 |
+
" <td>117</td>\n",
|
488 |
+
" <td>23763</td>\n",
|
489 |
+
" <td>...</td>\n",
|
490 |
+
" <td>0</td>\n",
|
491 |
+
" <td>0</td>\n",
|
492 |
+
" <td>0</td>\n",
|
493 |
+
" <td>0</td>\n",
|
494 |
+
" <td>0</td>\n",
|
495 |
+
" <td>0</td>\n",
|
496 |
+
" <td>0</td>\n",
|
497 |
+
" <td>0</td>\n",
|
498 |
+
" <td>0</td>\n",
|
499 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
500 |
+
" </tr>\n",
|
501 |
+
" <tr>\n",
|
502 |
+
" <th>2</th>\n",
|
503 |
+
" <td>101</td>\n",
|
504 |
+
" <td>148</td>\n",
|
505 |
+
" <td>30471</td>\n",
|
506 |
+
" <td>10774</td>\n",
|
507 |
+
" <td>13785</td>\n",
|
508 |
+
" <td>13779</td>\n",
|
509 |
+
" <td>33642</td>\n",
|
510 |
+
" <td>14399</td>\n",
|
511 |
+
" <td>48271</td>\n",
|
512 |
+
" <td>76686</td>\n",
|
513 |
+
" <td>...</td>\n",
|
514 |
+
" <td>0</td>\n",
|
515 |
+
" <td>0</td>\n",
|
516 |
+
" <td>0</td>\n",
|
517 |
+
" <td>0</td>\n",
|
518 |
+
" <td>0</td>\n",
|
519 |
+
" <td>0</td>\n",
|
520 |
+
" <td>0</td>\n",
|
521 |
+
" <td>0</td>\n",
|
522 |
+
" <td>0</td>\n",
|
523 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
524 |
+
" </tr>\n",
|
525 |
+
" <tr>\n",
|
526 |
+
" <th>3</th>\n",
|
527 |
+
" <td>101</td>\n",
|
528 |
+
" <td>19319</td>\n",
|
529 |
+
" <td>16724</td>\n",
|
530 |
+
" <td>10118</td>\n",
|
531 |
+
" <td>10107</td>\n",
|
532 |
+
" <td>78323</td>\n",
|
533 |
+
" <td>12407</td>\n",
|
534 |
+
" <td>38959</td>\n",
|
535 |
+
" <td>22934</td>\n",
|
536 |
+
" <td>10147</td>\n",
|
537 |
+
" <td>...</td>\n",
|
538 |
+
" <td>0</td>\n",
|
539 |
+
" <td>0</td>\n",
|
540 |
+
" <td>0</td>\n",
|
541 |
+
" <td>0</td>\n",
|
542 |
+
" <td>0</td>\n",
|
543 |
+
" <td>0</td>\n",
|
544 |
+
" <td>0</td>\n",
|
545 |
+
" <td>0</td>\n",
|
546 |
+
" <td>0</td>\n",
|
547 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
548 |
+
" </tr>\n",
|
549 |
+
" <tr>\n",
|
550 |
+
" <th>4</th>\n",
|
551 |
+
" <td>101</td>\n",
|
552 |
+
" <td>30932</td>\n",
|
553 |
+
" <td>58706</td>\n",
|
554 |
+
" <td>58054</td>\n",
|
555 |
+
" <td>44907</td>\n",
|
556 |
+
" <td>10224</td>\n",
|
557 |
+
" <td>106583</td>\n",
|
558 |
+
" <td>10288</td>\n",
|
559 |
+
" <td>12524</td>\n",
|
560 |
+
" <td>13878</td>\n",
|
561 |
+
" <td>...</td>\n",
|
562 |
+
" <td>0</td>\n",
|
563 |
+
" <td>0</td>\n",
|
564 |
+
" <td>0</td>\n",
|
565 |
+
" <td>0</td>\n",
|
566 |
+
" <td>0</td>\n",
|
567 |
+
" <td>0</td>\n",
|
568 |
+
" <td>0</td>\n",
|
569 |
+
" <td>0</td>\n",
|
570 |
+
" <td>0</td>\n",
|
571 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
572 |
+
" </tr>\n",
|
573 |
+
" <tr>\n",
|
574 |
+
" <th>...</th>\n",
|
575 |
+
" <td>...</td>\n",
|
576 |
+
" <td>...</td>\n",
|
577 |
+
" <td>...</td>\n",
|
578 |
+
" <td>...</td>\n",
|
579 |
+
" <td>...</td>\n",
|
580 |
+
" <td>...</td>\n",
|
581 |
+
" <td>...</td>\n",
|
582 |
+
" <td>...</td>\n",
|
583 |
+
" <td>...</td>\n",
|
584 |
+
" <td>...</td>\n",
|
585 |
+
" <td>...</td>\n",
|
586 |
+
" <td>...</td>\n",
|
587 |
+
" <td>...</td>\n",
|
588 |
+
" <td>...</td>\n",
|
589 |
+
" <td>...</td>\n",
|
590 |
+
" <td>...</td>\n",
|
591 |
+
" <td>...</td>\n",
|
592 |
+
" <td>...</td>\n",
|
593 |
+
" <td>...</td>\n",
|
594 |
+
" <td>...</td>\n",
|
595 |
+
" <td>...</td>\n",
|
596 |
+
" </tr>\n",
|
597 |
+
" <tr>\n",
|
598 |
+
" <th>43344</th>\n",
|
599 |
+
" <td>101</td>\n",
|
600 |
+
" <td>20065</td>\n",
|
601 |
+
" <td>10161</td>\n",
|
602 |
+
" <td>115</td>\n",
|
603 |
+
" <td>115</td>\n",
|
604 |
+
" <td>103784</td>\n",
|
605 |
+
" <td>10774</td>\n",
|
606 |
+
" <td>21388</td>\n",
|
607 |
+
" <td>10245</td>\n",
|
608 |
+
" <td>92067</td>\n",
|
609 |
+
" <td>...</td>\n",
|
610 |
+
" <td>0</td>\n",
|
611 |
+
" <td>0</td>\n",
|
612 |
+
" <td>0</td>\n",
|
613 |
+
" <td>0</td>\n",
|
614 |
+
" <td>0</td>\n",
|
615 |
+
" <td>0</td>\n",
|
616 |
+
" <td>0</td>\n",
|
617 |
+
" <td>0</td>\n",
|
618 |
+
" <td>1</td>\n",
|
619 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
620 |
+
" </tr>\n",
|
621 |
+
" <tr>\n",
|
622 |
+
" <th>43345</th>\n",
|
623 |
+
" <td>101</td>\n",
|
624 |
+
" <td>139</td>\n",
|
625 |
+
" <td>80839</td>\n",
|
626 |
+
" <td>24109</td>\n",
|
627 |
+
" <td>13406</td>\n",
|
628 |
+
" <td>18985</td>\n",
|
629 |
+
" <td>16285</td>\n",
|
630 |
+
" <td>10163</td>\n",
|
631 |
+
" <td>11062</td>\n",
|
632 |
+
" <td>276</td>\n",
|
633 |
+
" <td>...</td>\n",
|
634 |
+
" <td>0</td>\n",
|
635 |
+
" <td>0</td>\n",
|
636 |
+
" <td>0</td>\n",
|
637 |
+
" <td>0</td>\n",
|
638 |
+
" <td>0</td>\n",
|
639 |
+
" <td>0</td>\n",
|
640 |
+
" <td>0</td>\n",
|
641 |
+
" <td>0</td>\n",
|
642 |
+
" <td>1</td>\n",
|
643 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
644 |
+
" </tr>\n",
|
645 |
+
" <tr>\n",
|
646 |
+
" <th>43346</th>\n",
|
647 |
+
" <td>101</td>\n",
|
648 |
+
" <td>105549</td>\n",
|
649 |
+
" <td>102635</td>\n",
|
650 |
+
" <td>10140</td>\n",
|
651 |
+
" <td>26943</td>\n",
|
652 |
+
" <td>11499</td>\n",
|
653 |
+
" <td>110516</td>\n",
|
654 |
+
" <td>21899</td>\n",
|
655 |
+
" <td>11861</td>\n",
|
656 |
+
" <td>10561</td>\n",
|
657 |
+
" <td>...</td>\n",
|
658 |
+
" <td>0</td>\n",
|
659 |
+
" <td>0</td>\n",
|
660 |
+
" <td>0</td>\n",
|
661 |
+
" <td>0</td>\n",
|
662 |
+
" <td>0</td>\n",
|
663 |
+
" <td>0</td>\n",
|
664 |
+
" <td>0</td>\n",
|
665 |
+
" <td>0</td>\n",
|
666 |
+
" <td>1</td>\n",
|
667 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
668 |
+
" </tr>\n",
|
669 |
+
" <tr>\n",
|
670 |
+
" <th>43347</th>\n",
|
671 |
+
" <td>101</td>\n",
|
672 |
+
" <td>81424</td>\n",
|
673 |
+
" <td>26398</td>\n",
|
674 |
+
" <td>92017</td>\n",
|
675 |
+
" <td>109620</td>\n",
|
676 |
+
" <td>10941</td>\n",
|
677 |
+
" <td>76010</td>\n",
|
678 |
+
" <td>10115</td>\n",
|
679 |
+
" <td>19830</td>\n",
|
680 |
+
" <td>26083</td>\n",
|
681 |
+
" <td>...</td>\n",
|
682 |
+
" <td>0</td>\n",
|
683 |
+
" <td>0</td>\n",
|
684 |
+
" <td>0</td>\n",
|
685 |
+
" <td>0</td>\n",
|
686 |
+
" <td>0</td>\n",
|
687 |
+
" <td>0</td>\n",
|
688 |
+
" <td>0</td>\n",
|
689 |
+
" <td>0</td>\n",
|
690 |
+
" <td>1</td>\n",
|
691 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
692 |
+
" </tr>\n",
|
693 |
+
" <tr>\n",
|
694 |
+
" <th>43348</th>\n",
|
695 |
+
" <td>101</td>\n",
|
696 |
+
" <td>39774</td>\n",
|
697 |
+
" <td>11127</td>\n",
|
698 |
+
" <td>45989</td>\n",
|
699 |
+
" <td>24596</td>\n",
|
700 |
+
" <td>11933</td>\n",
|
701 |
+
" <td>170</td>\n",
|
702 |
+
" <td>17145</td>\n",
|
703 |
+
" <td>10710</td>\n",
|
704 |
+
" <td>39125</td>\n",
|
705 |
+
" <td>...</td>\n",
|
706 |
+
" <td>0</td>\n",
|
707 |
+
" <td>0</td>\n",
|
708 |
+
" <td>0</td>\n",
|
709 |
+
" <td>0</td>\n",
|
710 |
+
" <td>0</td>\n",
|
711 |
+
" <td>0</td>\n",
|
712 |
+
" <td>0</td>\n",
|
713 |
+
" <td>0</td>\n",
|
714 |
+
" <td>1</td>\n",
|
715 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
716 |
+
" </tr>\n",
|
717 |
+
" </tbody>\n",
|
718 |
+
"</table>\n",
|
719 |
+
"<p>43349 rows × 66 columns</p>\n",
|
720 |
+
"</div>"
|
721 |
+
],
|
722 |
+
"text/plain": [
|
723 |
+
" 0 1 2 3 4 5 6 7 8 \n",
|
724 |
+
"0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
|
725 |
+
"1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
|
726 |
+
"2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
|
727 |
+
"3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
|
728 |
+
"4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
|
729 |
+
"... ... ... ... ... ... ... ... ... ... \n",
|
730 |
+
"43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
|
731 |
+
"43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
|
732 |
+
"43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
|
733 |
+
"43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
|
734 |
+
"43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
|
735 |
+
"\n",
|
736 |
+
" 9 ... 56 57 58 59 60 61 62 63 labels \n",
|
737 |
+
"0 31623 ... 0 0 0 0 0 0 0 0 0 \\\n",
|
738 |
+
"1 23763 ... 0 0 0 0 0 0 0 0 0 \n",
|
739 |
+
"2 76686 ... 0 0 0 0 0 0 0 0 0 \n",
|
740 |
+
"3 10147 ... 0 0 0 0 0 0 0 0 0 \n",
|
741 |
+
"4 13878 ... 0 0 0 0 0 0 0 0 0 \n",
|
742 |
+
"... ... ... .. .. .. .. .. .. .. .. ... \n",
|
743 |
+
"43344 92067 ... 0 0 0 0 0 0 0 0 1 \n",
|
744 |
+
"43345 276 ... 0 0 0 0 0 0 0 0 1 \n",
|
745 |
+
"43346 10561 ... 0 0 0 0 0 0 0 0 1 \n",
|
746 |
+
"43347 26083 ... 0 0 0 0 0 0 0 0 1 \n",
|
747 |
+
"43348 39125 ... 0 0 0 0 0 0 0 0 1 \n",
|
748 |
+
"\n",
|
749 |
+
" tweet \n",
|
750 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
|
751 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
|
752 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
|
753 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
|
754 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
|
755 |
+
"... ... \n",
|
756 |
+
"43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
|
757 |
+
"43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
|
758 |
+
"43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
|
759 |
+
"43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
|
760 |
+
"43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
|
761 |
+
"\n",
|
762 |
+
"[43349 rows x 66 columns]"
|
763 |
+
]
|
764 |
+
},
|
765 |
+
"execution_count": 4,
|
766 |
+
"metadata": {},
|
767 |
+
"output_type": "execute_result"
|
768 |
+
}
|
769 |
+
],
|
770 |
+
"source": [
|
771 |
+
"embeding_df"
|
772 |
+
]
|
773 |
+
},
|
774 |
+
{
|
775 |
+
"cell_type": "code",
|
776 |
+
"execution_count": 5,
|
777 |
+
"metadata": {},
|
778 |
+
"outputs": [],
|
779 |
+
"source": [
|
780 |
+
"data = embeding_df.tweet.values"
|
781 |
+
]
|
782 |
+
},
|
783 |
+
{
|
784 |
+
"cell_type": "code",
|
785 |
+
"execution_count": 6,
|
786 |
+
"metadata": {},
|
787 |
+
"outputs": [],
|
788 |
+
"source": [
|
789 |
+
"embeding_df=embeding_df.drop(['tweet'], axis=1)\n",
|
790 |
+
"copy_df=copy_df.drop(['tweet'], axis=1)"
|
791 |
+
]
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"attachments": {},
|
795 |
+
"cell_type": "markdown",
|
796 |
+
"metadata": {},
|
797 |
+
"source": [
|
798 |
+
"# isolation forest"
|
799 |
+
]
|
800 |
+
},
|
801 |
+
{
|
802 |
+
"cell_type": "code",
|
803 |
+
"execution_count": 7,
|
804 |
+
"metadata": {},
|
805 |
+
"outputs": [],
|
806 |
+
"source": [
|
807 |
+
"from sklearn.ensemble import IsolationForest"
|
808 |
+
]
|
809 |
+
},
|
810 |
+
{
|
811 |
+
"cell_type": "code",
|
812 |
+
"execution_count": 8,
|
813 |
+
"metadata": {},
|
814 |
+
"outputs": [
|
815 |
+
{
|
816 |
+
"name": "stderr",
|
817 |
+
"output_type": "stream",
|
818 |
+
"text": [
|
819 |
+
"/home/sebit/anaconda3/envs/dl_env/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names\n",
|
820 |
+
" warnings.warn(\n"
|
821 |
+
]
|
822 |
+
}
|
823 |
+
],
|
824 |
+
"source": [
|
825 |
+
"# Train the model\n",
|
826 |
+
"isf = IsolationForest(contamination=0.04)\n",
|
827 |
+
"isf.fit(embeding_df)\n",
|
828 |
+
"# Predictions\n",
|
829 |
+
"predictions = isf.predict(embeding_df)"
|
830 |
+
]
|
831 |
+
},
|
832 |
+
{
|
833 |
+
"cell_type": "code",
|
834 |
+
"execution_count": 9,
|
835 |
+
"metadata": {},
|
836 |
+
"outputs": [
|
837 |
+
{
|
838 |
+
"data": {
|
839 |
+
"text/html": [
|
840 |
+
"<div>\n",
|
841 |
+
"<style scoped>\n",
|
842 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
843 |
+
" vertical-align: middle;\n",
|
844 |
+
" }\n",
|
845 |
+
"\n",
|
846 |
+
" .dataframe tbody tr th {\n",
|
847 |
+
" vertical-align: top;\n",
|
848 |
+
" }\n",
|
849 |
+
"\n",
|
850 |
+
" .dataframe thead th {\n",
|
851 |
+
" text-align: right;\n",
|
852 |
+
" }\n",
|
853 |
+
"</style>\n",
|
854 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
855 |
+
" <thead>\n",
|
856 |
+
" <tr style=\"text-align: right;\">\n",
|
857 |
+
" <th></th>\n",
|
858 |
+
" <th>0</th>\n",
|
859 |
+
" <th>1</th>\n",
|
860 |
+
" <th>2</th>\n",
|
861 |
+
" <th>3</th>\n",
|
862 |
+
" <th>4</th>\n",
|
863 |
+
" <th>5</th>\n",
|
864 |
+
" <th>6</th>\n",
|
865 |
+
" <th>7</th>\n",
|
866 |
+
" <th>8</th>\n",
|
867 |
+
" <th>9</th>\n",
|
868 |
+
" <th>...</th>\n",
|
869 |
+
" <th>57</th>\n",
|
870 |
+
" <th>58</th>\n",
|
871 |
+
" <th>59</th>\n",
|
872 |
+
" <th>60</th>\n",
|
873 |
+
" <th>61</th>\n",
|
874 |
+
" <th>62</th>\n",
|
875 |
+
" <th>63</th>\n",
|
876 |
+
" <th>labels</th>\n",
|
877 |
+
" <th>iso_forest_scores</th>\n",
|
878 |
+
" <th>iso_forest_outliers</th>\n",
|
879 |
+
" </tr>\n",
|
880 |
+
" </thead>\n",
|
881 |
+
" <tbody>\n",
|
882 |
+
" <tr>\n",
|
883 |
+
" <th>count</th>\n",
|
884 |
+
" <td>43349.0</td>\n",
|
885 |
+
" <td>43349.000000</td>\n",
|
886 |
+
" <td>43349.000000</td>\n",
|
887 |
+
" <td>43349.000000</td>\n",
|
888 |
+
" <td>43349.000000</td>\n",
|
889 |
+
" <td>43349.000000</td>\n",
|
890 |
+
" <td>43349.000000</td>\n",
|
891 |
+
" <td>43349.000000</td>\n",
|
892 |
+
" <td>43349.000000</td>\n",
|
893 |
+
" <td>43349.000000</td>\n",
|
894 |
+
" <td>...</td>\n",
|
895 |
+
" <td>43349.000000</td>\n",
|
896 |
+
" <td>43349.000000</td>\n",
|
897 |
+
" <td>43349.00000</td>\n",
|
898 |
+
" <td>43349.000000</td>\n",
|
899 |
+
" <td>43349.000000</td>\n",
|
900 |
+
" <td>43349.000000</td>\n",
|
901 |
+
" <td>43349.000000</td>\n",
|
902 |
+
" <td>43349.000000</td>\n",
|
903 |
+
" <td>43349.000000</td>\n",
|
904 |
+
" <td>43349.000000</td>\n",
|
905 |
+
" </tr>\n",
|
906 |
+
" <tr>\n",
|
907 |
+
" <th>mean</th>\n",
|
908 |
+
" <td>101.0</td>\n",
|
909 |
+
" <td>27403.389559</td>\n",
|
910 |
+
" <td>29588.353803</td>\n",
|
911 |
+
" <td>26720.445131</td>\n",
|
912 |
+
" <td>27755.110106</td>\n",
|
913 |
+
" <td>27346.753628</td>\n",
|
914 |
+
" <td>27713.189255</td>\n",
|
915 |
+
" <td>27295.717687</td>\n",
|
916 |
+
" <td>27136.227410</td>\n",
|
917 |
+
" <td>26812.611156</td>\n",
|
918 |
+
" <td>...</td>\n",
|
919 |
+
" <td>4868.917184</td>\n",
|
920 |
+
" <td>4813.145309</td>\n",
|
921 |
+
" <td>4733.38919</td>\n",
|
922 |
+
" <td>4389.068375</td>\n",
|
923 |
+
" <td>4297.575723</td>\n",
|
924 |
+
" <td>4176.437080</td>\n",
|
925 |
+
" <td>17.000392</td>\n",
|
926 |
+
" <td>0.417957</td>\n",
|
927 |
+
" <td>0.135546</td>\n",
|
928 |
+
" <td>0.919998</td>\n",
|
929 |
+
" </tr>\n",
|
930 |
+
" <tr>\n",
|
931 |
+
" <th>std</th>\n",
|
932 |
+
" <td>0.0</td>\n",
|
933 |
+
" <td>27382.274693</td>\n",
|
934 |
+
" <td>27727.688965</td>\n",
|
935 |
+
" <td>26455.267691</td>\n",
|
936 |
+
" <td>27026.611068</td>\n",
|
937 |
+
" <td>26799.753823</td>\n",
|
938 |
+
" <td>27021.950023</td>\n",
|
939 |
+
" <td>26761.847936</td>\n",
|
940 |
+
" <td>26820.810219</td>\n",
|
941 |
+
" <td>26720.480625</td>\n",
|
942 |
+
" <td>...</td>\n",
|
943 |
+
" <td>15312.358275</td>\n",
|
944 |
+
" <td>15491.136511</td>\n",
|
945 |
+
" <td>15387.09038</td>\n",
|
946 |
+
" <td>14617.253040</td>\n",
|
947 |
+
" <td>14643.580886</td>\n",
|
948 |
+
" <td>14405.397208</td>\n",
|
949 |
+
" <td>38.013945</td>\n",
|
950 |
+
" <td>0.493229</td>\n",
|
951 |
+
" <td>0.066701</td>\n",
|
952 |
+
" <td>0.391927</td>\n",
|
953 |
+
" </tr>\n",
|
954 |
+
" <tr>\n",
|
955 |
+
" <th>min</th>\n",
|
956 |
+
" <td>101.0</td>\n",
|
957 |
+
" <td>100.000000</td>\n",
|
958 |
+
" <td>100.000000</td>\n",
|
959 |
+
" <td>0.000000</td>\n",
|
960 |
+
" <td>0.000000</td>\n",
|
961 |
+
" <td>0.000000</td>\n",
|
962 |
+
" <td>0.000000</td>\n",
|
963 |
+
" <td>0.000000</td>\n",
|
964 |
+
" <td>0.000000</td>\n",
|
965 |
+
" <td>0.000000</td>\n",
|
966 |
+
" <td>...</td>\n",
|
967 |
+
" <td>0.000000</td>\n",
|
968 |
+
" <td>0.000000</td>\n",
|
969 |
+
" <td>0.00000</td>\n",
|
970 |
+
" <td>0.000000</td>\n",
|
971 |
+
" <td>0.000000</td>\n",
|
972 |
+
" <td>0.000000</td>\n",
|
973 |
+
" <td>0.000000</td>\n",
|
974 |
+
" <td>0.000000</td>\n",
|
975 |
+
" <td>-0.140643</td>\n",
|
976 |
+
" <td>-1.000000</td>\n",
|
977 |
+
" </tr>\n",
|
978 |
+
" <tr>\n",
|
979 |
+
" <th>25%</th>\n",
|
980 |
+
" <td>101.0</td>\n",
|
981 |
+
" <td>10357.000000</td>\n",
|
982 |
+
" <td>10506.000000</td>\n",
|
983 |
+
" <td>10323.000000</td>\n",
|
984 |
+
" <td>10361.000000</td>\n",
|
985 |
+
" <td>10350.000000</td>\n",
|
986 |
+
" <td>10369.000000</td>\n",
|
987 |
+
" <td>10347.000000</td>\n",
|
988 |
+
" <td>10330.000000</td>\n",
|
989 |
+
" <td>10323.000000</td>\n",
|
990 |
+
" <td>...</td>\n",
|
991 |
+
" <td>0.000000</td>\n",
|
992 |
+
" <td>0.000000</td>\n",
|
993 |
+
" <td>0.00000</td>\n",
|
994 |
+
" <td>0.000000</td>\n",
|
995 |
+
" <td>0.000000</td>\n",
|
996 |
+
" <td>0.000000</td>\n",
|
997 |
+
" <td>0.000000</td>\n",
|
998 |
+
" <td>0.000000</td>\n",
|
999 |
+
" <td>0.089100</td>\n",
|
1000 |
+
" <td>1.000000</td>\n",
|
1001 |
+
" </tr>\n",
|
1002 |
+
" <tr>\n",
|
1003 |
+
" <th>50%</th>\n",
|
1004 |
+
" <td>101.0</td>\n",
|
1005 |
+
" <td>18856.000000</td>\n",
|
1006 |
+
" <td>16263.000000</td>\n",
|
1007 |
+
" <td>13587.000000</td>\n",
|
1008 |
+
" <td>14918.000000</td>\n",
|
1009 |
+
" <td>14753.000000</td>\n",
|
1010 |
+
" <td>15090.000000</td>\n",
|
1011 |
+
" <td>14777.000000</td>\n",
|
1012 |
+
" <td>14753.000000</td>\n",
|
1013 |
+
" <td>14110.000000</td>\n",
|
1014 |
+
" <td>...</td>\n",
|
1015 |
+
" <td>0.000000</td>\n",
|
1016 |
+
" <td>0.000000</td>\n",
|
1017 |
+
" <td>0.00000</td>\n",
|
1018 |
+
" <td>0.000000</td>\n",
|
1019 |
+
" <td>0.000000</td>\n",
|
1020 |
+
" <td>0.000000</td>\n",
|
1021 |
+
" <td>0.000000</td>\n",
|
1022 |
+
" <td>0.000000</td>\n",
|
1023 |
+
" <td>0.161505</td>\n",
|
1024 |
+
" <td>1.000000</td>\n",
|
1025 |
+
" </tr>\n",
|
1026 |
+
" <tr>\n",
|
1027 |
+
" <th>75%</th>\n",
|
1028 |
+
" <td>101.0</td>\n",
|
1029 |
+
" <td>41079.000000</td>\n",
|
1030 |
+
" <td>40762.000000</td>\n",
|
1031 |
+
" <td>35943.000000</td>\n",
|
1032 |
+
" <td>37820.000000</td>\n",
|
1033 |
+
" <td>36544.000000</td>\n",
|
1034 |
+
" <td>37820.000000</td>\n",
|
1035 |
+
" <td>36723.000000</td>\n",
|
1036 |
+
" <td>36544.000000</td>\n",
|
1037 |
+
" <td>36445.000000</td>\n",
|
1038 |
+
" <td>...</td>\n",
|
1039 |
+
" <td>0.000000</td>\n",
|
1040 |
+
" <td>0.000000</td>\n",
|
1041 |
+
" <td>0.00000</td>\n",
|
1042 |
+
" <td>0.000000</td>\n",
|
1043 |
+
" <td>0.000000</td>\n",
|
1044 |
+
" <td>0.000000</td>\n",
|
1045 |
+
" <td>0.000000</td>\n",
|
1046 |
+
" <td>1.000000</td>\n",
|
1047 |
+
" <td>0.189511</td>\n",
|
1048 |
+
" <td>1.000000</td>\n",
|
1049 |
+
" </tr>\n",
|
1050 |
+
" <tr>\n",
|
1051 |
+
" <th>max</th>\n",
|
1052 |
+
" <td>101.0</td>\n",
|
1053 |
+
" <td>110744.000000</td>\n",
|
1054 |
+
" <td>110966.000000</td>\n",
|
1055 |
+
" <td>110966.000000</td>\n",
|
1056 |
+
" <td>110966.000000</td>\n",
|
1057 |
+
" <td>111720.000000</td>\n",
|
1058 |
+
" <td>111720.000000</td>\n",
|
1059 |
+
" <td>111720.000000</td>\n",
|
1060 |
+
" <td>111720.000000</td>\n",
|
1061 |
+
" <td>111720.000000</td>\n",
|
1062 |
+
" <td>...</td>\n",
|
1063 |
+
" <td>110966.000000</td>\n",
|
1064 |
+
" <td>110966.000000</td>\n",
|
1065 |
+
" <td>110966.00000</td>\n",
|
1066 |
+
" <td>110966.000000</td>\n",
|
1067 |
+
" <td>110966.000000</td>\n",
|
1068 |
+
" <td>110966.000000</td>\n",
|
1069 |
+
" <td>102.000000</td>\n",
|
1070 |
+
" <td>1.000000</td>\n",
|
1071 |
+
" <td>0.216831</td>\n",
|
1072 |
+
" <td>1.000000</td>\n",
|
1073 |
+
" </tr>\n",
|
1074 |
+
" </tbody>\n",
|
1075 |
+
"</table>\n",
|
1076 |
+
"<p>8 rows × 67 columns</p>\n",
|
1077 |
+
"</div>"
|
1078 |
+
],
|
1079 |
+
"text/plain": [
|
1080 |
+
" 0 1 2 3 4 \n",
|
1081 |
+
"count 43349.0 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
|
1082 |
+
"mean 101.0 27403.389559 29588.353803 26720.445131 27755.110106 \n",
|
1083 |
+
"std 0.0 27382.274693 27727.688965 26455.267691 27026.611068 \n",
|
1084 |
+
"min 101.0 100.000000 100.000000 0.000000 0.000000 \n",
|
1085 |
+
"25% 101.0 10357.000000 10506.000000 10323.000000 10361.000000 \n",
|
1086 |
+
"50% 101.0 18856.000000 16263.000000 13587.000000 14918.000000 \n",
|
1087 |
+
"75% 101.0 41079.000000 40762.000000 35943.000000 37820.000000 \n",
|
1088 |
+
"max 101.0 110744.000000 110966.000000 110966.000000 110966.000000 \n",
|
1089 |
+
"\n",
|
1090 |
+
" 5 6 7 8 \n",
|
1091 |
+
"count 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
|
1092 |
+
"mean 27346.753628 27713.189255 27295.717687 27136.227410 \n",
|
1093 |
+
"std 26799.753823 27021.950023 26761.847936 26820.810219 \n",
|
1094 |
+
"min 0.000000 0.000000 0.000000 0.000000 \n",
|
1095 |
+
"25% 10350.000000 10369.000000 10347.000000 10330.000000 \n",
|
1096 |
+
"50% 14753.000000 15090.000000 14777.000000 14753.000000 \n",
|
1097 |
+
"75% 36544.000000 37820.000000 36723.000000 36544.000000 \n",
|
1098 |
+
"max 111720.000000 111720.000000 111720.000000 111720.000000 \n",
|
1099 |
+
"\n",
|
1100 |
+
" 9 ... 57 58 59 \n",
|
1101 |
+
"count 43349.000000 ... 43349.000000 43349.000000 43349.00000 \\\n",
|
1102 |
+
"mean 26812.611156 ... 4868.917184 4813.145309 4733.38919 \n",
|
1103 |
+
"std 26720.480625 ... 15312.358275 15491.136511 15387.09038 \n",
|
1104 |
+
"min 0.000000 ... 0.000000 0.000000 0.00000 \n",
|
1105 |
+
"25% 10323.000000 ... 0.000000 0.000000 0.00000 \n",
|
1106 |
+
"50% 14110.000000 ... 0.000000 0.000000 0.00000 \n",
|
1107 |
+
"75% 36445.000000 ... 0.000000 0.000000 0.00000 \n",
|
1108 |
+
"max 111720.000000 ... 110966.000000 110966.000000 110966.00000 \n",
|
1109 |
+
"\n",
|
1110 |
+
" 60 61 62 63 \n",
|
1111 |
+
"count 43349.000000 43349.000000 43349.000000 43349.000000 \\\n",
|
1112 |
+
"mean 4389.068375 4297.575723 4176.437080 17.000392 \n",
|
1113 |
+
"std 14617.253040 14643.580886 14405.397208 38.013945 \n",
|
1114 |
+
"min 0.000000 0.000000 0.000000 0.000000 \n",
|
1115 |
+
"25% 0.000000 0.000000 0.000000 0.000000 \n",
|
1116 |
+
"50% 0.000000 0.000000 0.000000 0.000000 \n",
|
1117 |
+
"75% 0.000000 0.000000 0.000000 0.000000 \n",
|
1118 |
+
"max 110966.000000 110966.000000 110966.000000 102.000000 \n",
|
1119 |
+
"\n",
|
1120 |
+
" labels iso_forest_scores iso_forest_outliers \n",
|
1121 |
+
"count 43349.000000 43349.000000 43349.000000 \n",
|
1122 |
+
"mean 0.417957 0.135546 0.919998 \n",
|
1123 |
+
"std 0.493229 0.066701 0.391927 \n",
|
1124 |
+
"min 0.000000 -0.140643 -1.000000 \n",
|
1125 |
+
"25% 0.000000 0.089100 1.000000 \n",
|
1126 |
+
"50% 0.000000 0.161505 1.000000 \n",
|
1127 |
+
"75% 1.000000 0.189511 1.000000 \n",
|
1128 |
+
"max 1.000000 0.216831 1.000000 \n",
|
1129 |
+
"\n",
|
1130 |
+
"[8 rows x 67 columns]"
|
1131 |
+
]
|
1132 |
+
},
|
1133 |
+
"execution_count": 9,
|
1134 |
+
"metadata": {},
|
1135 |
+
"output_type": "execute_result"
|
1136 |
+
}
|
1137 |
+
],
|
1138 |
+
"source": [
|
1139 |
+
"# Extract scores\n",
|
1140 |
+
"embeding_df[\"iso_forest_scores\"] = isf.decision_function(embeding_df)\n",
|
1141 |
+
"# Extract predictions\n",
|
1142 |
+
"embeding_df[\"iso_forest_outliers\"] = predictions\n",
|
1143 |
+
"# Describe the dataframe\n",
|
1144 |
+
"embeding_df.describe()"
|
1145 |
+
]
|
1146 |
+
},
|
1147 |
+
{
|
1148 |
+
"cell_type": "code",
|
1149 |
+
"execution_count": 10,
|
1150 |
+
"metadata": {},
|
1151 |
+
"outputs": [
|
1152 |
+
{
|
1153 |
+
"data": {
|
1154 |
+
"text/html": [
|
1155 |
+
"<div>\n",
|
1156 |
+
"<style scoped>\n",
|
1157 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1158 |
+
" vertical-align: middle;\n",
|
1159 |
+
" }\n",
|
1160 |
+
"\n",
|
1161 |
+
" .dataframe tbody tr th {\n",
|
1162 |
+
" vertical-align: top;\n",
|
1163 |
+
" }\n",
|
1164 |
+
"\n",
|
1165 |
+
" .dataframe thead th {\n",
|
1166 |
+
" text-align: right;\n",
|
1167 |
+
" }\n",
|
1168 |
+
"</style>\n",
|
1169 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1170 |
+
" <thead>\n",
|
1171 |
+
" <tr style=\"text-align: right;\">\n",
|
1172 |
+
" <th></th>\n",
|
1173 |
+
" <th>0</th>\n",
|
1174 |
+
" <th>1</th>\n",
|
1175 |
+
" <th>2</th>\n",
|
1176 |
+
" <th>3</th>\n",
|
1177 |
+
" <th>4</th>\n",
|
1178 |
+
" <th>5</th>\n",
|
1179 |
+
" <th>6</th>\n",
|
1180 |
+
" <th>7</th>\n",
|
1181 |
+
" <th>8</th>\n",
|
1182 |
+
" <th>9</th>\n",
|
1183 |
+
" <th>...</th>\n",
|
1184 |
+
" <th>57</th>\n",
|
1185 |
+
" <th>58</th>\n",
|
1186 |
+
" <th>59</th>\n",
|
1187 |
+
" <th>60</th>\n",
|
1188 |
+
" <th>61</th>\n",
|
1189 |
+
" <th>62</th>\n",
|
1190 |
+
" <th>63</th>\n",
|
1191 |
+
" <th>labels</th>\n",
|
1192 |
+
" <th>iso_forest_scores</th>\n",
|
1193 |
+
" <th>iso_forest_outliers</th>\n",
|
1194 |
+
" </tr>\n",
|
1195 |
+
" </thead>\n",
|
1196 |
+
" <tbody>\n",
|
1197 |
+
" <tr>\n",
|
1198 |
+
" <th>0</th>\n",
|
1199 |
+
" <td>101</td>\n",
|
1200 |
+
" <td>10110</td>\n",
|
1201 |
+
" <td>175</td>\n",
|
1202 |
+
" <td>78653</td>\n",
|
1203 |
+
" <td>189</td>\n",
|
1204 |
+
" <td>25285</td>\n",
|
1205 |
+
" <td>15976</td>\n",
|
1206 |
+
" <td>40840</td>\n",
|
1207 |
+
" <td>276</td>\n",
|
1208 |
+
" <td>31623</td>\n",
|
1209 |
+
" <td>...</td>\n",
|
1210 |
+
" <td>0</td>\n",
|
1211 |
+
" <td>0</td>\n",
|
1212 |
+
" <td>0</td>\n",
|
1213 |
+
" <td>0</td>\n",
|
1214 |
+
" <td>0</td>\n",
|
1215 |
+
" <td>0</td>\n",
|
1216 |
+
" <td>0</td>\n",
|
1217 |
+
" <td>0</td>\n",
|
1218 |
+
" <td>0.189202</td>\n",
|
1219 |
+
" <td>No</td>\n",
|
1220 |
+
" </tr>\n",
|
1221 |
+
" <tr>\n",
|
1222 |
+
" <th>1</th>\n",
|
1223 |
+
" <td>101</td>\n",
|
1224 |
+
" <td>11589</td>\n",
|
1225 |
+
" <td>10706</td>\n",
|
1226 |
+
" <td>10713</td>\n",
|
1227 |
+
" <td>10794</td>\n",
|
1228 |
+
" <td>94698</td>\n",
|
1229 |
+
" <td>30668</td>\n",
|
1230 |
+
" <td>24883</td>\n",
|
1231 |
+
" <td>117</td>\n",
|
1232 |
+
" <td>23763</td>\n",
|
1233 |
+
" <td>...</td>\n",
|
1234 |
+
" <td>0</td>\n",
|
1235 |
+
" <td>0</td>\n",
|
1236 |
+
" <td>0</td>\n",
|
1237 |
+
" <td>0</td>\n",
|
1238 |
+
" <td>0</td>\n",
|
1239 |
+
" <td>0</td>\n",
|
1240 |
+
" <td>0</td>\n",
|
1241 |
+
" <td>0</td>\n",
|
1242 |
+
" <td>0.181234</td>\n",
|
1243 |
+
" <td>No</td>\n",
|
1244 |
+
" </tr>\n",
|
1245 |
+
" <tr>\n",
|
1246 |
+
" <th>2</th>\n",
|
1247 |
+
" <td>101</td>\n",
|
1248 |
+
" <td>148</td>\n",
|
1249 |
+
" <td>30471</td>\n",
|
1250 |
+
" <td>10774</td>\n",
|
1251 |
+
" <td>13785</td>\n",
|
1252 |
+
" <td>13779</td>\n",
|
1253 |
+
" <td>33642</td>\n",
|
1254 |
+
" <td>14399</td>\n",
|
1255 |
+
" <td>48271</td>\n",
|
1256 |
+
" <td>76686</td>\n",
|
1257 |
+
" <td>...</td>\n",
|
1258 |
+
" <td>0</td>\n",
|
1259 |
+
" <td>0</td>\n",
|
1260 |
+
" <td>0</td>\n",
|
1261 |
+
" <td>0</td>\n",
|
1262 |
+
" <td>0</td>\n",
|
1263 |
+
" <td>0</td>\n",
|
1264 |
+
" <td>0</td>\n",
|
1265 |
+
" <td>0</td>\n",
|
1266 |
+
" <td>0.166332</td>\n",
|
1267 |
+
" <td>No</td>\n",
|
1268 |
+
" </tr>\n",
|
1269 |
+
" <tr>\n",
|
1270 |
+
" <th>3</th>\n",
|
1271 |
+
" <td>101</td>\n",
|
1272 |
+
" <td>19319</td>\n",
|
1273 |
+
" <td>16724</td>\n",
|
1274 |
+
" <td>10118</td>\n",
|
1275 |
+
" <td>10107</td>\n",
|
1276 |
+
" <td>78323</td>\n",
|
1277 |
+
" <td>12407</td>\n",
|
1278 |
+
" <td>38959</td>\n",
|
1279 |
+
" <td>22934</td>\n",
|
1280 |
+
" <td>10147</td>\n",
|
1281 |
+
" <td>...</td>\n",
|
1282 |
+
" <td>0</td>\n",
|
1283 |
+
" <td>0</td>\n",
|
1284 |
+
" <td>0</td>\n",
|
1285 |
+
" <td>0</td>\n",
|
1286 |
+
" <td>0</td>\n",
|
1287 |
+
" <td>0</td>\n",
|
1288 |
+
" <td>0</td>\n",
|
1289 |
+
" <td>0</td>\n",
|
1290 |
+
" <td>0.151816</td>\n",
|
1291 |
+
" <td>No</td>\n",
|
1292 |
+
" </tr>\n",
|
1293 |
+
" <tr>\n",
|
1294 |
+
" <th>4</th>\n",
|
1295 |
+
" <td>101</td>\n",
|
1296 |
+
" <td>30932</td>\n",
|
1297 |
+
" <td>58706</td>\n",
|
1298 |
+
" <td>58054</td>\n",
|
1299 |
+
" <td>44907</td>\n",
|
1300 |
+
" <td>10224</td>\n",
|
1301 |
+
" <td>106583</td>\n",
|
1302 |
+
" <td>10288</td>\n",
|
1303 |
+
" <td>12524</td>\n",
|
1304 |
+
" <td>13878</td>\n",
|
1305 |
+
" <td>...</td>\n",
|
1306 |
+
" <td>0</td>\n",
|
1307 |
+
" <td>0</td>\n",
|
1308 |
+
" <td>0</td>\n",
|
1309 |
+
" <td>0</td>\n",
|
1310 |
+
" <td>0</td>\n",
|
1311 |
+
" <td>0</td>\n",
|
1312 |
+
" <td>0</td>\n",
|
1313 |
+
" <td>0</td>\n",
|
1314 |
+
" <td>0.184008</td>\n",
|
1315 |
+
" <td>No</td>\n",
|
1316 |
+
" </tr>\n",
|
1317 |
+
" <tr>\n",
|
1318 |
+
" <th>...</th>\n",
|
1319 |
+
" <td>...</td>\n",
|
1320 |
+
" <td>...</td>\n",
|
1321 |
+
" <td>...</td>\n",
|
1322 |
+
" <td>...</td>\n",
|
1323 |
+
" <td>...</td>\n",
|
1324 |
+
" <td>...</td>\n",
|
1325 |
+
" <td>...</td>\n",
|
1326 |
+
" <td>...</td>\n",
|
1327 |
+
" <td>...</td>\n",
|
1328 |
+
" <td>...</td>\n",
|
1329 |
+
" <td>...</td>\n",
|
1330 |
+
" <td>...</td>\n",
|
1331 |
+
" <td>...</td>\n",
|
1332 |
+
" <td>...</td>\n",
|
1333 |
+
" <td>...</td>\n",
|
1334 |
+
" <td>...</td>\n",
|
1335 |
+
" <td>...</td>\n",
|
1336 |
+
" <td>...</td>\n",
|
1337 |
+
" <td>...</td>\n",
|
1338 |
+
" <td>...</td>\n",
|
1339 |
+
" <td>...</td>\n",
|
1340 |
+
" </tr>\n",
|
1341 |
+
" <tr>\n",
|
1342 |
+
" <th>43344</th>\n",
|
1343 |
+
" <td>101</td>\n",
|
1344 |
+
" <td>20065</td>\n",
|
1345 |
+
" <td>10161</td>\n",
|
1346 |
+
" <td>115</td>\n",
|
1347 |
+
" <td>115</td>\n",
|
1348 |
+
" <td>103784</td>\n",
|
1349 |
+
" <td>10774</td>\n",
|
1350 |
+
" <td>21388</td>\n",
|
1351 |
+
" <td>10245</td>\n",
|
1352 |
+
" <td>92067</td>\n",
|
1353 |
+
" <td>...</td>\n",
|
1354 |
+
" <td>0</td>\n",
|
1355 |
+
" <td>0</td>\n",
|
1356 |
+
" <td>0</td>\n",
|
1357 |
+
" <td>0</td>\n",
|
1358 |
+
" <td>0</td>\n",
|
1359 |
+
" <td>0</td>\n",
|
1360 |
+
" <td>0</td>\n",
|
1361 |
+
" <td>1</td>\n",
|
1362 |
+
" <td>0.079412</td>\n",
|
1363 |
+
" <td>No</td>\n",
|
1364 |
+
" </tr>\n",
|
1365 |
+
" <tr>\n",
|
1366 |
+
" <th>43345</th>\n",
|
1367 |
+
" <td>101</td>\n",
|
1368 |
+
" <td>139</td>\n",
|
1369 |
+
" <td>80839</td>\n",
|
1370 |
+
" <td>24109</td>\n",
|
1371 |
+
" <td>13406</td>\n",
|
1372 |
+
" <td>18985</td>\n",
|
1373 |
+
" <td>16285</td>\n",
|
1374 |
+
" <td>10163</td>\n",
|
1375 |
+
" <td>11062</td>\n",
|
1376 |
+
" <td>276</td>\n",
|
1377 |
+
" <td>...</td>\n",
|
1378 |
+
" <td>0</td>\n",
|
1379 |
+
" <td>0</td>\n",
|
1380 |
+
" <td>0</td>\n",
|
1381 |
+
" <td>0</td>\n",
|
1382 |
+
" <td>0</td>\n",
|
1383 |
+
" <td>0</td>\n",
|
1384 |
+
" <td>0</td>\n",
|
1385 |
+
" <td>1</td>\n",
|
1386 |
+
" <td>0.118245</td>\n",
|
1387 |
+
" <td>No</td>\n",
|
1388 |
+
" </tr>\n",
|
1389 |
+
" <tr>\n",
|
1390 |
+
" <th>43346</th>\n",
|
1391 |
+
" <td>101</td>\n",
|
1392 |
+
" <td>105549</td>\n",
|
1393 |
+
" <td>102635</td>\n",
|
1394 |
+
" <td>10140</td>\n",
|
1395 |
+
" <td>26943</td>\n",
|
1396 |
+
" <td>11499</td>\n",
|
1397 |
+
" <td>110516</td>\n",
|
1398 |
+
" <td>21899</td>\n",
|
1399 |
+
" <td>11861</td>\n",
|
1400 |
+
" <td>10561</td>\n",
|
1401 |
+
" <td>...</td>\n",
|
1402 |
+
" <td>0</td>\n",
|
1403 |
+
" <td>0</td>\n",
|
1404 |
+
" <td>0</td>\n",
|
1405 |
+
" <td>0</td>\n",
|
1406 |
+
" <td>0</td>\n",
|
1407 |
+
" <td>0</td>\n",
|
1408 |
+
" <td>0</td>\n",
|
1409 |
+
" <td>1</td>\n",
|
1410 |
+
" <td>0.138229</td>\n",
|
1411 |
+
" <td>No</td>\n",
|
1412 |
+
" </tr>\n",
|
1413 |
+
" <tr>\n",
|
1414 |
+
" <th>43347</th>\n",
|
1415 |
+
" <td>101</td>\n",
|
1416 |
+
" <td>81424</td>\n",
|
1417 |
+
" <td>26398</td>\n",
|
1418 |
+
" <td>92017</td>\n",
|
1419 |
+
" <td>109620</td>\n",
|
1420 |
+
" <td>10941</td>\n",
|
1421 |
+
" <td>76010</td>\n",
|
1422 |
+
" <td>10115</td>\n",
|
1423 |
+
" <td>19830</td>\n",
|
1424 |
+
" <td>26083</td>\n",
|
1425 |
+
" <td>...</td>\n",
|
1426 |
+
" <td>0</td>\n",
|
1427 |
+
" <td>0</td>\n",
|
1428 |
+
" <td>0</td>\n",
|
1429 |
+
" <td>0</td>\n",
|
1430 |
+
" <td>0</td>\n",
|
1431 |
+
" <td>0</td>\n",
|
1432 |
+
" <td>0</td>\n",
|
1433 |
+
" <td>1</td>\n",
|
1434 |
+
" <td>0.181065</td>\n",
|
1435 |
+
" <td>No</td>\n",
|
1436 |
+
" </tr>\n",
|
1437 |
+
" <tr>\n",
|
1438 |
+
" <th>43348</th>\n",
|
1439 |
+
" <td>101</td>\n",
|
1440 |
+
" <td>39774</td>\n",
|
1441 |
+
" <td>11127</td>\n",
|
1442 |
+
" <td>45989</td>\n",
|
1443 |
+
" <td>24596</td>\n",
|
1444 |
+
" <td>11933</td>\n",
|
1445 |
+
" <td>170</td>\n",
|
1446 |
+
" <td>17145</td>\n",
|
1447 |
+
" <td>10710</td>\n",
|
1448 |
+
" <td>39125</td>\n",
|
1449 |
+
" <td>...</td>\n",
|
1450 |
+
" <td>0</td>\n",
|
1451 |
+
" <td>0</td>\n",
|
1452 |
+
" <td>0</td>\n",
|
1453 |
+
" <td>0</td>\n",
|
1454 |
+
" <td>0</td>\n",
|
1455 |
+
" <td>0</td>\n",
|
1456 |
+
" <td>0</td>\n",
|
1457 |
+
" <td>1</td>\n",
|
1458 |
+
" <td>0.085161</td>\n",
|
1459 |
+
" <td>No</td>\n",
|
1460 |
+
" </tr>\n",
|
1461 |
+
" </tbody>\n",
|
1462 |
+
"</table>\n",
|
1463 |
+
"<p>43349 rows × 67 columns</p>\n",
|
1464 |
+
"</div>"
|
1465 |
+
],
|
1466 |
+
"text/plain": [
|
1467 |
+
" 0 1 2 3 4 5 6 7 8 \n",
|
1468 |
+
"0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
|
1469 |
+
"1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
|
1470 |
+
"2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
|
1471 |
+
"3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
|
1472 |
+
"4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
|
1473 |
+
"... ... ... ... ... ... ... ... ... ... \n",
|
1474 |
+
"43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
|
1475 |
+
"43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
|
1476 |
+
"43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
|
1477 |
+
"43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
|
1478 |
+
"43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
|
1479 |
+
"\n",
|
1480 |
+
" 9 ... 57 58 59 60 61 62 63 labels iso_forest_scores \n",
|
1481 |
+
"0 31623 ... 0 0 0 0 0 0 0 0 0.189202 \\\n",
|
1482 |
+
"1 23763 ... 0 0 0 0 0 0 0 0 0.181234 \n",
|
1483 |
+
"2 76686 ... 0 0 0 0 0 0 0 0 0.166332 \n",
|
1484 |
+
"3 10147 ... 0 0 0 0 0 0 0 0 0.151816 \n",
|
1485 |
+
"4 13878 ... 0 0 0 0 0 0 0 0 0.184008 \n",
|
1486 |
+
"... ... ... .. .. .. .. .. .. .. ... ... \n",
|
1487 |
+
"43344 92067 ... 0 0 0 0 0 0 0 1 0.079412 \n",
|
1488 |
+
"43345 276 ... 0 0 0 0 0 0 0 1 0.118245 \n",
|
1489 |
+
"43346 10561 ... 0 0 0 0 0 0 0 1 0.138229 \n",
|
1490 |
+
"43347 26083 ... 0 0 0 0 0 0 0 1 0.181065 \n",
|
1491 |
+
"43348 39125 ... 0 0 0 0 0 0 0 1 0.085161 \n",
|
1492 |
+
"\n",
|
1493 |
+
" iso_forest_outliers \n",
|
1494 |
+
"0 No \n",
|
1495 |
+
"1 No \n",
|
1496 |
+
"2 No \n",
|
1497 |
+
"3 No \n",
|
1498 |
+
"4 No \n",
|
1499 |
+
"... ... \n",
|
1500 |
+
"43344 No \n",
|
1501 |
+
"43345 No \n",
|
1502 |
+
"43346 No \n",
|
1503 |
+
"43347 No \n",
|
1504 |
+
"43348 No \n",
|
1505 |
+
"\n",
|
1506 |
+
"[43349 rows x 67 columns]"
|
1507 |
+
]
|
1508 |
+
},
|
1509 |
+
"execution_count": 10,
|
1510 |
+
"metadata": {},
|
1511 |
+
"output_type": "execute_result"
|
1512 |
+
}
|
1513 |
+
],
|
1514 |
+
"source": [
|
1515 |
+
"# Replace \"-1\" with \"Yes\" and \"1\" with \"No\"\n",
|
1516 |
+
"embeding_df['iso_forest_outliers'] = embeding_df['iso_forest_outliers'].replace([-1, 1], [\"Yes\", \"No\"])\n",
|
1517 |
+
"# Print the first 5 firms\n",
|
1518 |
+
"embeding_df"
|
1519 |
+
]
|
1520 |
+
},
|
1521 |
+
{
|
1522 |
+
"cell_type": "code",
|
1523 |
+
"execution_count": 11,
|
1524 |
+
"metadata": {},
|
1525 |
+
"outputs": [
|
1526 |
+
{
|
1527 |
+
"data": {
|
1528 |
+
"text/plain": [
|
1529 |
+
"iso_forest_outliers\n",
|
1530 |
+
"False 43349\n",
|
1531 |
+
"Name: count, dtype: int64"
|
1532 |
+
]
|
1533 |
+
},
|
1534 |
+
"execution_count": 11,
|
1535 |
+
"metadata": {},
|
1536 |
+
"output_type": "execute_result"
|
1537 |
+
}
|
1538 |
+
],
|
1539 |
+
"source": [
|
1540 |
+
"(embeding_df['iso_forest_outliers']=='Yes').value_counts()"
|
1541 |
+
]
|
1542 |
+
},
|
1543 |
+
{
|
1544 |
+
"attachments": {},
|
1545 |
+
"cell_type": "markdown",
|
1546 |
+
"metadata": {},
|
1547 |
+
"source": [
|
1548 |
+
"# lof"
|
1549 |
+
]
|
1550 |
+
},
|
1551 |
+
{
|
1552 |
+
"cell_type": "code",
|
1553 |
+
"execution_count": 12,
|
1554 |
+
"metadata": {},
|
1555 |
+
"outputs": [],
|
1556 |
+
"source": [
|
1557 |
+
"from sklearn.neighbors import LocalOutlierFactor\n",
|
1558 |
+
"from numpy import quantile, where, random"
|
1559 |
+
]
|
1560 |
+
},
|
1561 |
+
{
|
1562 |
+
"cell_type": "code",
|
1563 |
+
"execution_count": 13,
|
1564 |
+
"metadata": {},
|
1565 |
+
"outputs": [],
|
1566 |
+
"source": [
|
1567 |
+
"# Train the model\n",
|
1568 |
+
"clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)\n",
|
1569 |
+
"out=clf.fit_predict(copy_df)\n",
|
1570 |
+
"# Predictions\n",
|
1571 |
+
"lof = clf.negative_outlier_factor_\n",
|
1572 |
+
"embeding_df[\"lof_outliers\"] = lof\n",
|
1573 |
+
"embeding_df[\"outliers\"]= out"
|
1574 |
+
]
|
1575 |
+
},
|
1576 |
+
{
|
1577 |
+
"cell_type": "code",
|
1578 |
+
"execution_count": 14,
|
1579 |
+
"metadata": {},
|
1580 |
+
"outputs": [],
|
1581 |
+
"source": [
|
1582 |
+
"embeding_df['outliers'] = embeding_df['outliers'].replace([-1, 1], [\"Yes\", \"No\"])"
|
1583 |
+
]
|
1584 |
+
},
|
1585 |
+
{
|
1586 |
+
"cell_type": "code",
|
1587 |
+
"execution_count": 15,
|
1588 |
+
"metadata": {},
|
1589 |
+
"outputs": [],
|
1590 |
+
"source": [
|
1591 |
+
"embeding_df['tweet']=data"
|
1592 |
+
]
|
1593 |
+
},
|
1594 |
+
{
|
1595 |
+
"cell_type": "code",
|
1596 |
+
"execution_count": 16,
|
1597 |
+
"metadata": {},
|
1598 |
+
"outputs": [],
|
1599 |
+
"source": [
|
1600 |
+
"x=embeding_df[embeding_df['iso_forest_outliers']=='Yes' ]"
|
1601 |
+
]
|
1602 |
+
},
|
1603 |
+
{
|
1604 |
+
"cell_type": "code",
|
1605 |
+
"execution_count": 17,
|
1606 |
+
"metadata": {},
|
1607 |
+
"outputs": [],
|
1608 |
+
"source": [
|
1609 |
+
"embeding_df.drop(x.loc[x['outliers']=='Yes' ].index, inplace=True)"
|
1610 |
+
]
|
1611 |
+
},
|
1612 |
+
{
|
1613 |
+
"cell_type": "code",
|
1614 |
+
"execution_count": 18,
|
1615 |
+
"metadata": {},
|
1616 |
+
"outputs": [
|
1617 |
+
{
|
1618 |
+
"data": {
|
1619 |
+
"text/html": [
|
1620 |
+
"<div>\n",
|
1621 |
+
"<style scoped>\n",
|
1622 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1623 |
+
" vertical-align: middle;\n",
|
1624 |
+
" }\n",
|
1625 |
+
"\n",
|
1626 |
+
" .dataframe tbody tr th {\n",
|
1627 |
+
" vertical-align: top;\n",
|
1628 |
+
" }\n",
|
1629 |
+
"\n",
|
1630 |
+
" .dataframe thead th {\n",
|
1631 |
+
" text-align: right;\n",
|
1632 |
+
" }\n",
|
1633 |
+
"</style>\n",
|
1634 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1635 |
+
" <thead>\n",
|
1636 |
+
" <tr style=\"text-align: right;\">\n",
|
1637 |
+
" <th></th>\n",
|
1638 |
+
" <th>0</th>\n",
|
1639 |
+
" <th>1</th>\n",
|
1640 |
+
" <th>2</th>\n",
|
1641 |
+
" <th>3</th>\n",
|
1642 |
+
" <th>4</th>\n",
|
1643 |
+
" <th>5</th>\n",
|
1644 |
+
" <th>6</th>\n",
|
1645 |
+
" <th>7</th>\n",
|
1646 |
+
" <th>8</th>\n",
|
1647 |
+
" <th>9</th>\n",
|
1648 |
+
" <th>...</th>\n",
|
1649 |
+
" <th>60</th>\n",
|
1650 |
+
" <th>61</th>\n",
|
1651 |
+
" <th>62</th>\n",
|
1652 |
+
" <th>63</th>\n",
|
1653 |
+
" <th>labels</th>\n",
|
1654 |
+
" <th>iso_forest_scores</th>\n",
|
1655 |
+
" <th>iso_forest_outliers</th>\n",
|
1656 |
+
" <th>lof_outliers</th>\n",
|
1657 |
+
" <th>outliers</th>\n",
|
1658 |
+
" <th>tweet</th>\n",
|
1659 |
+
" </tr>\n",
|
1660 |
+
" </thead>\n",
|
1661 |
+
" <tbody>\n",
|
1662 |
+
" <tr>\n",
|
1663 |
+
" <th>0</th>\n",
|
1664 |
+
" <td>101</td>\n",
|
1665 |
+
" <td>10110</td>\n",
|
1666 |
+
" <td>175</td>\n",
|
1667 |
+
" <td>78653</td>\n",
|
1668 |
+
" <td>189</td>\n",
|
1669 |
+
" <td>25285</td>\n",
|
1670 |
+
" <td>15976</td>\n",
|
1671 |
+
" <td>40840</td>\n",
|
1672 |
+
" <td>276</td>\n",
|
1673 |
+
" <td>31623</td>\n",
|
1674 |
+
" <td>...</td>\n",
|
1675 |
+
" <td>0</td>\n",
|
1676 |
+
" <td>0</td>\n",
|
1677 |
+
" <td>0</td>\n",
|
1678 |
+
" <td>0</td>\n",
|
1679 |
+
" <td>0</td>\n",
|
1680 |
+
" <td>0.189202</td>\n",
|
1681 |
+
" <td>No</td>\n",
|
1682 |
+
" <td>-1.209681</td>\n",
|
1683 |
+
" <td>No</td>\n",
|
1684 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
1685 |
+
" </tr>\n",
|
1686 |
+
" <tr>\n",
|
1687 |
+
" <th>1</th>\n",
|
1688 |
+
" <td>101</td>\n",
|
1689 |
+
" <td>11589</td>\n",
|
1690 |
+
" <td>10706</td>\n",
|
1691 |
+
" <td>10713</td>\n",
|
1692 |
+
" <td>10794</td>\n",
|
1693 |
+
" <td>94698</td>\n",
|
1694 |
+
" <td>30668</td>\n",
|
1695 |
+
" <td>24883</td>\n",
|
1696 |
+
" <td>117</td>\n",
|
1697 |
+
" <td>23763</td>\n",
|
1698 |
+
" <td>...</td>\n",
|
1699 |
+
" <td>0</td>\n",
|
1700 |
+
" <td>0</td>\n",
|
1701 |
+
" <td>0</td>\n",
|
1702 |
+
" <td>0</td>\n",
|
1703 |
+
" <td>0</td>\n",
|
1704 |
+
" <td>0.181234</td>\n",
|
1705 |
+
" <td>No</td>\n",
|
1706 |
+
" <td>-1.107479</td>\n",
|
1707 |
+
" <td>No</td>\n",
|
1708 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
1709 |
+
" </tr>\n",
|
1710 |
+
" <tr>\n",
|
1711 |
+
" <th>2</th>\n",
|
1712 |
+
" <td>101</td>\n",
|
1713 |
+
" <td>148</td>\n",
|
1714 |
+
" <td>30471</td>\n",
|
1715 |
+
" <td>10774</td>\n",
|
1716 |
+
" <td>13785</td>\n",
|
1717 |
+
" <td>13779</td>\n",
|
1718 |
+
" <td>33642</td>\n",
|
1719 |
+
" <td>14399</td>\n",
|
1720 |
+
" <td>48271</td>\n",
|
1721 |
+
" <td>76686</td>\n",
|
1722 |
+
" <td>...</td>\n",
|
1723 |
+
" <td>0</td>\n",
|
1724 |
+
" <td>0</td>\n",
|
1725 |
+
" <td>0</td>\n",
|
1726 |
+
" <td>0</td>\n",
|
1727 |
+
" <td>0</td>\n",
|
1728 |
+
" <td>0.166332</td>\n",
|
1729 |
+
" <td>No</td>\n",
|
1730 |
+
" <td>-1.202529</td>\n",
|
1731 |
+
" <td>No</td>\n",
|
1732 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
1733 |
+
" </tr>\n",
|
1734 |
+
" <tr>\n",
|
1735 |
+
" <th>3</th>\n",
|
1736 |
+
" <td>101</td>\n",
|
1737 |
+
" <td>19319</td>\n",
|
1738 |
+
" <td>16724</td>\n",
|
1739 |
+
" <td>10118</td>\n",
|
1740 |
+
" <td>10107</td>\n",
|
1741 |
+
" <td>78323</td>\n",
|
1742 |
+
" <td>12407</td>\n",
|
1743 |
+
" <td>38959</td>\n",
|
1744 |
+
" <td>22934</td>\n",
|
1745 |
+
" <td>10147</td>\n",
|
1746 |
+
" <td>...</td>\n",
|
1747 |
+
" <td>0</td>\n",
|
1748 |
+
" <td>0</td>\n",
|
1749 |
+
" <td>0</td>\n",
|
1750 |
+
" <td>0</td>\n",
|
1751 |
+
" <td>0</td>\n",
|
1752 |
+
" <td>0.151816</td>\n",
|
1753 |
+
" <td>No</td>\n",
|
1754 |
+
" <td>-1.216599</td>\n",
|
1755 |
+
" <td>No</td>\n",
|
1756 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
1757 |
+
" </tr>\n",
|
1758 |
+
" <tr>\n",
|
1759 |
+
" <th>4</th>\n",
|
1760 |
+
" <td>101</td>\n",
|
1761 |
+
" <td>30932</td>\n",
|
1762 |
+
" <td>58706</td>\n",
|
1763 |
+
" <td>58054</td>\n",
|
1764 |
+
" <td>44907</td>\n",
|
1765 |
+
" <td>10224</td>\n",
|
1766 |
+
" <td>106583</td>\n",
|
1767 |
+
" <td>10288</td>\n",
|
1768 |
+
" <td>12524</td>\n",
|
1769 |
+
" <td>13878</td>\n",
|
1770 |
+
" <td>...</td>\n",
|
1771 |
+
" <td>0</td>\n",
|
1772 |
+
" <td>0</td>\n",
|
1773 |
+
" <td>0</td>\n",
|
1774 |
+
" <td>0</td>\n",
|
1775 |
+
" <td>0</td>\n",
|
1776 |
+
" <td>0.184008</td>\n",
|
1777 |
+
" <td>No</td>\n",
|
1778 |
+
" <td>-1.188488</td>\n",
|
1779 |
+
" <td>No</td>\n",
|
1780 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
1781 |
+
" </tr>\n",
|
1782 |
+
" <tr>\n",
|
1783 |
+
" <th>...</th>\n",
|
1784 |
+
" <td>...</td>\n",
|
1785 |
+
" <td>...</td>\n",
|
1786 |
+
" <td>...</td>\n",
|
1787 |
+
" <td>...</td>\n",
|
1788 |
+
" <td>...</td>\n",
|
1789 |
+
" <td>...</td>\n",
|
1790 |
+
" <td>...</td>\n",
|
1791 |
+
" <td>...</td>\n",
|
1792 |
+
" <td>...</td>\n",
|
1793 |
+
" <td>...</td>\n",
|
1794 |
+
" <td>...</td>\n",
|
1795 |
+
" <td>...</td>\n",
|
1796 |
+
" <td>...</td>\n",
|
1797 |
+
" <td>...</td>\n",
|
1798 |
+
" <td>...</td>\n",
|
1799 |
+
" <td>...</td>\n",
|
1800 |
+
" <td>...</td>\n",
|
1801 |
+
" <td>...</td>\n",
|
1802 |
+
" <td>...</td>\n",
|
1803 |
+
" <td>...</td>\n",
|
1804 |
+
" <td>...</td>\n",
|
1805 |
+
" </tr>\n",
|
1806 |
+
" <tr>\n",
|
1807 |
+
" <th>43344</th>\n",
|
1808 |
+
" <td>101</td>\n",
|
1809 |
+
" <td>20065</td>\n",
|
1810 |
+
" <td>10161</td>\n",
|
1811 |
+
" <td>115</td>\n",
|
1812 |
+
" <td>115</td>\n",
|
1813 |
+
" <td>103784</td>\n",
|
1814 |
+
" <td>10774</td>\n",
|
1815 |
+
" <td>21388</td>\n",
|
1816 |
+
" <td>10245</td>\n",
|
1817 |
+
" <td>92067</td>\n",
|
1818 |
+
" <td>...</td>\n",
|
1819 |
+
" <td>0</td>\n",
|
1820 |
+
" <td>0</td>\n",
|
1821 |
+
" <td>0</td>\n",
|
1822 |
+
" <td>0</td>\n",
|
1823 |
+
" <td>1</td>\n",
|
1824 |
+
" <td>0.079412</td>\n",
|
1825 |
+
" <td>No</td>\n",
|
1826 |
+
" <td>-1.196769</td>\n",
|
1827 |
+
" <td>No</td>\n",
|
1828 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
1829 |
+
" </tr>\n",
|
1830 |
+
" <tr>\n",
|
1831 |
+
" <th>43345</th>\n",
|
1832 |
+
" <td>101</td>\n",
|
1833 |
+
" <td>139</td>\n",
|
1834 |
+
" <td>80839</td>\n",
|
1835 |
+
" <td>24109</td>\n",
|
1836 |
+
" <td>13406</td>\n",
|
1837 |
+
" <td>18985</td>\n",
|
1838 |
+
" <td>16285</td>\n",
|
1839 |
+
" <td>10163</td>\n",
|
1840 |
+
" <td>11062</td>\n",
|
1841 |
+
" <td>276</td>\n",
|
1842 |
+
" <td>...</td>\n",
|
1843 |
+
" <td>0</td>\n",
|
1844 |
+
" <td>0</td>\n",
|
1845 |
+
" <td>0</td>\n",
|
1846 |
+
" <td>0</td>\n",
|
1847 |
+
" <td>1</td>\n",
|
1848 |
+
" <td>0.118245</td>\n",
|
1849 |
+
" <td>No</td>\n",
|
1850 |
+
" <td>-1.108304</td>\n",
|
1851 |
+
" <td>No</td>\n",
|
1852 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
1853 |
+
" </tr>\n",
|
1854 |
+
" <tr>\n",
|
1855 |
+
" <th>43346</th>\n",
|
1856 |
+
" <td>101</td>\n",
|
1857 |
+
" <td>105549</td>\n",
|
1858 |
+
" <td>102635</td>\n",
|
1859 |
+
" <td>10140</td>\n",
|
1860 |
+
" <td>26943</td>\n",
|
1861 |
+
" <td>11499</td>\n",
|
1862 |
+
" <td>110516</td>\n",
|
1863 |
+
" <td>21899</td>\n",
|
1864 |
+
" <td>11861</td>\n",
|
1865 |
+
" <td>10561</td>\n",
|
1866 |
+
" <td>...</td>\n",
|
1867 |
+
" <td>0</td>\n",
|
1868 |
+
" <td>0</td>\n",
|
1869 |
+
" <td>0</td>\n",
|
1870 |
+
" <td>0</td>\n",
|
1871 |
+
" <td>1</td>\n",
|
1872 |
+
" <td>0.138229</td>\n",
|
1873 |
+
" <td>No</td>\n",
|
1874 |
+
" <td>-1.307328</td>\n",
|
1875 |
+
" <td>No</td>\n",
|
1876 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
1877 |
+
" </tr>\n",
|
1878 |
+
" <tr>\n",
|
1879 |
+
" <th>43347</th>\n",
|
1880 |
+
" <td>101</td>\n",
|
1881 |
+
" <td>81424</td>\n",
|
1882 |
+
" <td>26398</td>\n",
|
1883 |
+
" <td>92017</td>\n",
|
1884 |
+
" <td>109620</td>\n",
|
1885 |
+
" <td>10941</td>\n",
|
1886 |
+
" <td>76010</td>\n",
|
1887 |
+
" <td>10115</td>\n",
|
1888 |
+
" <td>19830</td>\n",
|
1889 |
+
" <td>26083</td>\n",
|
1890 |
+
" <td>...</td>\n",
|
1891 |
+
" <td>0</td>\n",
|
1892 |
+
" <td>0</td>\n",
|
1893 |
+
" <td>0</td>\n",
|
1894 |
+
" <td>0</td>\n",
|
1895 |
+
" <td>1</td>\n",
|
1896 |
+
" <td>0.181065</td>\n",
|
1897 |
+
" <td>No</td>\n",
|
1898 |
+
" <td>-1.127932</td>\n",
|
1899 |
+
" <td>No</td>\n",
|
1900 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
1901 |
+
" </tr>\n",
|
1902 |
+
" <tr>\n",
|
1903 |
+
" <th>43348</th>\n",
|
1904 |
+
" <td>101</td>\n",
|
1905 |
+
" <td>39774</td>\n",
|
1906 |
+
" <td>11127</td>\n",
|
1907 |
+
" <td>45989</td>\n",
|
1908 |
+
" <td>24596</td>\n",
|
1909 |
+
" <td>11933</td>\n",
|
1910 |
+
" <td>170</td>\n",
|
1911 |
+
" <td>17145</td>\n",
|
1912 |
+
" <td>10710</td>\n",
|
1913 |
+
" <td>39125</td>\n",
|
1914 |
+
" <td>...</td>\n",
|
1915 |
+
" <td>0</td>\n",
|
1916 |
+
" <td>0</td>\n",
|
1917 |
+
" <td>0</td>\n",
|
1918 |
+
" <td>0</td>\n",
|
1919 |
+
" <td>1</td>\n",
|
1920 |
+
" <td>0.085161</td>\n",
|
1921 |
+
" <td>No</td>\n",
|
1922 |
+
" <td>-1.286323</td>\n",
|
1923 |
+
" <td>No</td>\n",
|
1924 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
1925 |
+
" </tr>\n",
|
1926 |
+
" </tbody>\n",
|
1927 |
+
"</table>\n",
|
1928 |
+
"<p>43029 rows × 70 columns</p>\n",
|
1929 |
+
"</div>"
|
1930 |
+
],
|
1931 |
+
"text/plain": [
|
1932 |
+
" 0 1 2 3 4 5 6 7 8 \n",
|
1933 |
+
"0 101 10110 175 78653 189 25285 15976 40840 276 \\\n",
|
1934 |
+
"1 101 11589 10706 10713 10794 94698 30668 24883 117 \n",
|
1935 |
+
"2 101 148 30471 10774 13785 13779 33642 14399 48271 \n",
|
1936 |
+
"3 101 19319 16724 10118 10107 78323 12407 38959 22934 \n",
|
1937 |
+
"4 101 30932 58706 58054 44907 10224 106583 10288 12524 \n",
|
1938 |
+
"... ... ... ... ... ... ... ... ... ... \n",
|
1939 |
+
"43344 101 20065 10161 115 115 103784 10774 21388 10245 \n",
|
1940 |
+
"43345 101 139 80839 24109 13406 18985 16285 10163 11062 \n",
|
1941 |
+
"43346 101 105549 102635 10140 26943 11499 110516 21899 11861 \n",
|
1942 |
+
"43347 101 81424 26398 92017 109620 10941 76010 10115 19830 \n",
|
1943 |
+
"43348 101 39774 11127 45989 24596 11933 170 17145 10710 \n",
|
1944 |
+
"\n",
|
1945 |
+
" 9 ... 60 61 62 63 labels iso_forest_scores \n",
|
1946 |
+
"0 31623 ... 0 0 0 0 0 0.189202 \\\n",
|
1947 |
+
"1 23763 ... 0 0 0 0 0 0.181234 \n",
|
1948 |
+
"2 76686 ... 0 0 0 0 0 0.166332 \n",
|
1949 |
+
"3 10147 ... 0 0 0 0 0 0.151816 \n",
|
1950 |
+
"4 13878 ... 0 0 0 0 0 0.184008 \n",
|
1951 |
+
"... ... ... .. .. .. .. ... ... \n",
|
1952 |
+
"43344 92067 ... 0 0 0 0 1 0.079412 \n",
|
1953 |
+
"43345 276 ... 0 0 0 0 1 0.118245 \n",
|
1954 |
+
"43346 10561 ... 0 0 0 0 1 0.138229 \n",
|
1955 |
+
"43347 26083 ... 0 0 0 0 1 0.181065 \n",
|
1956 |
+
"43348 39125 ... 0 0 0 0 1 0.085161 \n",
|
1957 |
+
"\n",
|
1958 |
+
" iso_forest_outliers lof_outliers outliers \n",
|
1959 |
+
"0 No -1.209681 No \\\n",
|
1960 |
+
"1 No -1.107479 No \n",
|
1961 |
+
"2 No -1.202529 No \n",
|
1962 |
+
"3 No -1.216599 No \n",
|
1963 |
+
"4 No -1.188488 No \n",
|
1964 |
+
"... ... ... ... \n",
|
1965 |
+
"43344 No -1.196769 No \n",
|
1966 |
+
"43345 No -1.108304 No \n",
|
1967 |
+
"43346 No -1.307328 No \n",
|
1968 |
+
"43347 No -1.127932 No \n",
|
1969 |
+
"43348 No -1.286323 No \n",
|
1970 |
+
"\n",
|
1971 |
+
" tweet \n",
|
1972 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... \n",
|
1973 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... \n",
|
1974 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... \n",
|
1975 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... \n",
|
1976 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı \n",
|
1977 |
+
"... ... \n",
|
1978 |
+
"43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... \n",
|
1979 |
+
"43345 Böyle piçlerin çok erken ölmemelerini ve çok f... \n",
|
1980 |
+
"43346 Turgay denilen bu holigonda bir sorun yok, gur... \n",
|
1981 |
+
"43347 Umarım ülkenin düşük zekadan kurtulması ilgile... \n",
|
1982 |
+
"43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... \n",
|
1983 |
+
"\n",
|
1984 |
+
"[43029 rows x 70 columns]"
|
1985 |
+
]
|
1986 |
+
},
|
1987 |
+
"execution_count": 18,
|
1988 |
+
"metadata": {},
|
1989 |
+
"output_type": "execute_result"
|
1990 |
+
}
|
1991 |
+
],
|
1992 |
+
"source": [
|
1993 |
+
"embeding_df"
|
1994 |
+
]
|
1995 |
+
},
|
1996 |
+
{
|
1997 |
+
"cell_type": "code",
|
1998 |
+
"execution_count": 19,
|
1999 |
+
"metadata": {},
|
2000 |
+
"outputs": [],
|
2001 |
+
"source": [
|
2002 |
+
"# embeding_df.drop(embeding_df.loc[embeding_df['outliers']=='Yes' ].index, inplace=True)\n",
|
2003 |
+
"# embeding_df.drop(embeding_df.loc[embeding_df['iso_forest_outliers']=='Yes' ].index, inplace=True)"
|
2004 |
+
]
|
2005 |
+
},
|
2006 |
+
{
|
2007 |
+
"cell_type": "code",
|
2008 |
+
"execution_count": 20,
|
2009 |
+
"metadata": {},
|
2010 |
+
"outputs": [],
|
2011 |
+
"source": [
|
2012 |
+
"# iso_df=embeding_df[embeding_df['iso_forest_outliers']=='Yes' ]\n",
|
2013 |
+
"# embeding_df.drop(embeding_df.loc[embeding_df['iso_forest_outliers']=='Yes' ].index, inplace=True)"
|
2014 |
+
]
|
2015 |
+
},
|
2016 |
+
{
|
2017 |
+
"cell_type": "code",
|
2018 |
+
"execution_count": 21,
|
2019 |
+
"metadata": {},
|
2020 |
+
"outputs": [],
|
2021 |
+
"source": [
|
2022 |
+
"# lof_df=embeding_df[embeding_df['outliers']=='Yes' ]\n",
|
2023 |
+
"# embeding_df.drop(embeding_df.loc[embeding_df['outliers']=='Yes' ].index, inplace=True)"
|
2024 |
+
]
|
2025 |
+
},
|
2026 |
+
{
|
2027 |
+
"cell_type": "code",
|
2028 |
+
"execution_count": 22,
|
2029 |
+
"metadata": {},
|
2030 |
+
"outputs": [],
|
2031 |
+
"source": [
|
2032 |
+
"# iso_df"
|
2033 |
+
]
|
2034 |
+
},
|
2035 |
+
{
|
2036 |
+
"cell_type": "code",
|
2037 |
+
"execution_count": 23,
|
2038 |
+
"metadata": {},
|
2039 |
+
"outputs": [],
|
2040 |
+
"source": [
|
2041 |
+
"# iso_df['labels']=iso_df['labels'].replace({0: 1, 1: 0})"
|
2042 |
+
]
|
2043 |
+
},
|
2044 |
+
{
|
2045 |
+
"cell_type": "code",
|
2046 |
+
"execution_count": 24,
|
2047 |
+
"metadata": {},
|
2048 |
+
"outputs": [],
|
2049 |
+
"source": [
|
2050 |
+
"# iso_df"
|
2051 |
+
]
|
2052 |
+
},
|
2053 |
+
{
|
2054 |
+
"cell_type": "code",
|
2055 |
+
"execution_count": 25,
|
2056 |
+
"metadata": {},
|
2057 |
+
"outputs": [],
|
2058 |
+
"source": [
|
2059 |
+
"# lof_df['labels']=lof_df['labels'].replace({0: 1, 1: 0})"
|
2060 |
+
]
|
2061 |
+
},
|
2062 |
+
{
|
2063 |
+
"cell_type": "code",
|
2064 |
+
"execution_count": 26,
|
2065 |
+
"metadata": {},
|
2066 |
+
"outputs": [],
|
2067 |
+
"source": [
|
2068 |
+
"# lof_df"
|
2069 |
+
]
|
2070 |
+
},
|
2071 |
+
{
|
2072 |
+
"cell_type": "code",
|
2073 |
+
"execution_count": 27,
|
2074 |
+
"metadata": {},
|
2075 |
+
"outputs": [],
|
2076 |
+
"source": [
|
2077 |
+
"# x=pd.concat([lof_df,iso_df], axis=0)"
|
2078 |
+
]
|
2079 |
+
},
|
2080 |
+
{
|
2081 |
+
"cell_type": "code",
|
2082 |
+
"execution_count": 28,
|
2083 |
+
"metadata": {},
|
2084 |
+
"outputs": [],
|
2085 |
+
"source": [
|
2086 |
+
"# embeding_df=pd.concat([x,embeding_df], axis=0)"
|
2087 |
+
]
|
2088 |
+
},
|
2089 |
+
{
|
2090 |
+
"cell_type": "code",
|
2091 |
+
"execution_count": 29,
|
2092 |
+
"metadata": {},
|
2093 |
+
"outputs": [],
|
2094 |
+
"source": [
|
2095 |
+
"# embeding_df.reset_index()"
|
2096 |
+
]
|
2097 |
+
},
|
2098 |
+
{
|
2099 |
+
"cell_type": "code",
|
2100 |
+
"execution_count": 30,
|
2101 |
+
"metadata": {},
|
2102 |
+
"outputs": [],
|
2103 |
+
"source": [
|
2104 |
+
"# embeding_df=embeding_df.drop(['iso_forest_scores', 'iso_forest_outliers','lof_outliers','outliers'], axis=1)"
|
2105 |
+
]
|
2106 |
+
},
|
2107 |
+
{
|
2108 |
+
"cell_type": "code",
|
2109 |
+
"execution_count": 31,
|
2110 |
+
"metadata": {},
|
2111 |
+
"outputs": [],
|
2112 |
+
"source": [
|
2113 |
+
"embeding_df['0'] = embeding_df[embeding_df.columns[:-1]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)\n"
|
2114 |
+
]
|
2115 |
+
},
|
2116 |
+
{
|
2117 |
+
"cell_type": "code",
|
2118 |
+
"execution_count": 32,
|
2119 |
+
"metadata": {},
|
2120 |
+
"outputs": [],
|
2121 |
+
"source": [
|
2122 |
+
"df=pd.DataFrame()\n",
|
2123 |
+
"df['tweet']=embeding_df['tweet']\n",
|
2124 |
+
"df['subtas_a']=embeding_df['labels']\n"
|
2125 |
+
]
|
2126 |
+
},
|
2127 |
+
{
|
2128 |
+
"cell_type": "code",
|
2129 |
+
"execution_count": 33,
|
2130 |
+
"metadata": {},
|
2131 |
+
"outputs": [
|
2132 |
+
{
|
2133 |
+
"data": {
|
2134 |
+
"text/html": [
|
2135 |
+
"<div>\n",
|
2136 |
+
"<style scoped>\n",
|
2137 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
2138 |
+
" vertical-align: middle;\n",
|
2139 |
+
" }\n",
|
2140 |
+
"\n",
|
2141 |
+
" .dataframe tbody tr th {\n",
|
2142 |
+
" vertical-align: top;\n",
|
2143 |
+
" }\n",
|
2144 |
+
"\n",
|
2145 |
+
" .dataframe thead th {\n",
|
2146 |
+
" text-align: right;\n",
|
2147 |
+
" }\n",
|
2148 |
+
"</style>\n",
|
2149 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
2150 |
+
" <thead>\n",
|
2151 |
+
" <tr style=\"text-align: right;\">\n",
|
2152 |
+
" <th></th>\n",
|
2153 |
+
" <th>tweet</th>\n",
|
2154 |
+
" <th>subtas_a</th>\n",
|
2155 |
+
" </tr>\n",
|
2156 |
+
" </thead>\n",
|
2157 |
+
" <tbody>\n",
|
2158 |
+
" <tr>\n",
|
2159 |
+
" <th>0</th>\n",
|
2160 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
2161 |
+
" <td>0</td>\n",
|
2162 |
+
" </tr>\n",
|
2163 |
+
" <tr>\n",
|
2164 |
+
" <th>1</th>\n",
|
2165 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
2166 |
+
" <td>0</td>\n",
|
2167 |
+
" </tr>\n",
|
2168 |
+
" <tr>\n",
|
2169 |
+
" <th>2</th>\n",
|
2170 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
2171 |
+
" <td>0</td>\n",
|
2172 |
+
" </tr>\n",
|
2173 |
+
" <tr>\n",
|
2174 |
+
" <th>3</th>\n",
|
2175 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
2176 |
+
" <td>0</td>\n",
|
2177 |
+
" </tr>\n",
|
2178 |
+
" <tr>\n",
|
2179 |
+
" <th>4</th>\n",
|
2180 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
2181 |
+
" <td>0</td>\n",
|
2182 |
+
" </tr>\n",
|
2183 |
+
" <tr>\n",
|
2184 |
+
" <th>...</th>\n",
|
2185 |
+
" <td>...</td>\n",
|
2186 |
+
" <td>...</td>\n",
|
2187 |
+
" </tr>\n",
|
2188 |
+
" <tr>\n",
|
2189 |
+
" <th>43344</th>\n",
|
2190 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
2191 |
+
" <td>1</td>\n",
|
2192 |
+
" </tr>\n",
|
2193 |
+
" <tr>\n",
|
2194 |
+
" <th>43345</th>\n",
|
2195 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
2196 |
+
" <td>1</td>\n",
|
2197 |
+
" </tr>\n",
|
2198 |
+
" <tr>\n",
|
2199 |
+
" <th>43346</th>\n",
|
2200 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
2201 |
+
" <td>1</td>\n",
|
2202 |
+
" </tr>\n",
|
2203 |
+
" <tr>\n",
|
2204 |
+
" <th>43347</th>\n",
|
2205 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
2206 |
+
" <td>1</td>\n",
|
2207 |
+
" </tr>\n",
|
2208 |
+
" <tr>\n",
|
2209 |
+
" <th>43348</th>\n",
|
2210 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
2211 |
+
" <td>1</td>\n",
|
2212 |
+
" </tr>\n",
|
2213 |
+
" </tbody>\n",
|
2214 |
+
"</table>\n",
|
2215 |
+
"<p>43029 rows × 2 columns</p>\n",
|
2216 |
+
"</div>"
|
2217 |
+
],
|
2218 |
+
"text/plain": [
|
2219 |
+
" tweet subtas_a\n",
|
2220 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
|
2221 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
|
2222 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
|
2223 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
|
2224 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
|
2225 |
+
"... ... ...\n",
|
2226 |
+
"43344 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
|
2227 |
+
"43345 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
|
2228 |
+
"43346 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
|
2229 |
+
"43347 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
|
2230 |
+
"43348 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
|
2231 |
+
"\n",
|
2232 |
+
"[43029 rows x 2 columns]"
|
2233 |
+
]
|
2234 |
+
},
|
2235 |
+
"execution_count": 33,
|
2236 |
+
"metadata": {},
|
2237 |
+
"output_type": "execute_result"
|
2238 |
+
}
|
2239 |
+
],
|
2240 |
+
"source": [
|
2241 |
+
"df"
|
2242 |
+
]
|
2243 |
+
},
|
2244 |
+
{
|
2245 |
+
"cell_type": "code",
|
2246 |
+
"execution_count": 40,
|
2247 |
+
"metadata": {},
|
2248 |
+
"outputs": [],
|
2249 |
+
"source": [
|
2250 |
+
"df.to_csv('inverse_outliers.csv') "
|
2251 |
+
]
|
2252 |
+
},
|
2253 |
+
{
|
2254 |
+
"cell_type": "code",
|
2255 |
+
"execution_count": 34,
|
2256 |
+
"metadata": {},
|
2257 |
+
"outputs": [],
|
2258 |
+
"source": [
|
2259 |
+
"df.to_csv('int_2_outliers.csv') "
|
2260 |
+
]
|
2261 |
+
},
|
2262 |
+
{
|
2263 |
+
"cell_type": "code",
|
2264 |
+
"execution_count": null,
|
2265 |
+
"metadata": {},
|
2266 |
+
"outputs": [],
|
2267 |
+
"source": []
|
2268 |
+
}
|
2269 |
+
],
|
2270 |
+
"metadata": {
|
2271 |
+
"kernelspec": {
|
2272 |
+
"display_name": "dl_env",
|
2273 |
+
"language": "python",
|
2274 |
+
"name": "python3"
|
2275 |
+
},
|
2276 |
+
"language_info": {
|
2277 |
+
"codemirror_mode": {
|
2278 |
+
"name": "ipython",
|
2279 |
+
"version": 3
|
2280 |
+
},
|
2281 |
+
"file_extension": ".py",
|
2282 |
+
"mimetype": "text/x-python",
|
2283 |
+
"name": "python",
|
2284 |
+
"nbconvert_exporter": "python",
|
2285 |
+
"pygments_lexer": "ipython3",
|
2286 |
+
"version": "3.9.0"
|
2287 |
+
},
|
2288 |
+
"orig_nbformat": 4
|
2289 |
+
},
|
2290 |
+
"nbformat": 4,
|
2291 |
+
"nbformat_minor": 2
|
2292 |
+
}
|
pycaret_outlier_detection.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pytorch_lightning
|
2 |
+
emoji
|
3 |
+
transformers
|
4 |
+
numpy
|
5 |
+
pandas
|
6 |
+
os
|
7 |
+
random
|
8 |
+
torch
|
9 |
+
torch-metrics
|
10 |
+
torch-utils
|
trainer.ipynb
ADDED
@@ -0,0 +1,1165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import emoji\n",
|
10 |
+
"import numpy as np\n",
|
11 |
+
"import pandas as pd\n",
|
12 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
13 |
+
"from transformers import AutoTokenizer"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "code",
|
18 |
+
"execution_count": 2,
|
19 |
+
"metadata": {},
|
20 |
+
"outputs": [],
|
21 |
+
"source": [
|
22 |
+
"# train_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-training-v1.tsv',sep='\\t')\n",
|
23 |
+
"# test_df=pd.read_csv('/DATA/sin-kaf/offenseval-tr-testset-v1.tsv',sep='\\t')\n",
|
24 |
+
"# augmented_df=pd.read_csv('augmented_data_offensive.csv')\n",
|
25 |
+
"# selin_df=pd.read_csv('/DATA/sin-kaf/selin_data.csv')"
|
26 |
+
]
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"cell_type": "code",
|
30 |
+
"execution_count": 3,
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [],
|
33 |
+
"source": [
|
34 |
+
"outliers_df=pd.read_csv('/DATA/sin-kaf/cluster_outliers.csv')\n",
|
35 |
+
"outliers_df=outliers_df.drop(['Unnamed: 0'], axis=1)\n",
|
36 |
+
"outliers_df['subtask_a'] = outliers_df['subtas_a']\n",
|
37 |
+
"outliers_df=outliers_df.drop(['subtas_a'], axis=1)\n"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "code",
|
42 |
+
"execution_count": 4,
|
43 |
+
"metadata": {},
|
44 |
+
"outputs": [],
|
45 |
+
"source": [
|
46 |
+
"train_df=outliers_df"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": 5,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"# augmented_df=augmented_df.drop(['Unnamed: 0'], axis=1)\n",
|
56 |
+
"# augmented_df = augmented_df.dropna()\n",
|
57 |
+
"# train_df=pd.concat([train_df,augmented_df], axis=0)\n",
|
58 |
+
"# train_df=pd.concat([train_df,test_df], axis=0)\n",
|
59 |
+
"# train_df=train_df.drop(['id'], axis=1)\n",
|
60 |
+
"data=train_df['tweet'].tolist()\n",
|
61 |
+
"for i in range(len(data)):\n",
|
62 |
+
" data[i] = data[i].replace('@USER','')\n",
|
63 |
+
" data[i] = data[i].replace('#','')\n",
|
64 |
+
" data[i] = data[i].replace('$','')\n",
|
65 |
+
" data[i] = emoji.demojize(data[i])\n",
|
66 |
+
" \n",
|
67 |
+
"train_df['tweet'] = data\n",
|
68 |
+
"lab = LabelEncoder()\n",
|
69 |
+
"train_df['subtask_a'] = lab.fit_transform(train_df['subtask_a'])\n",
|
70 |
+
"df = train_df[train_df.subtask_a != 2]"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": 6,
|
76 |
+
"metadata": {},
|
77 |
+
"outputs": [
|
78 |
+
{
|
79 |
+
"data": {
|
80 |
+
"text/html": [
|
81 |
+
"<div>\n",
|
82 |
+
"<style scoped>\n",
|
83 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
84 |
+
" vertical-align: middle;\n",
|
85 |
+
" }\n",
|
86 |
+
"\n",
|
87 |
+
" .dataframe tbody tr th {\n",
|
88 |
+
" vertical-align: top;\n",
|
89 |
+
" }\n",
|
90 |
+
"\n",
|
91 |
+
" .dataframe thead th {\n",
|
92 |
+
" text-align: right;\n",
|
93 |
+
" }\n",
|
94 |
+
"</style>\n",
|
95 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
96 |
+
" <thead>\n",
|
97 |
+
" <tr style=\"text-align: right;\">\n",
|
98 |
+
" <th></th>\n",
|
99 |
+
" <th>tweet</th>\n",
|
100 |
+
" <th>subtask_a</th>\n",
|
101 |
+
" </tr>\n",
|
102 |
+
" </thead>\n",
|
103 |
+
" <tbody>\n",
|
104 |
+
" <tr>\n",
|
105 |
+
" <th>0</th>\n",
|
106 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
107 |
+
" <td>0</td>\n",
|
108 |
+
" </tr>\n",
|
109 |
+
" <tr>\n",
|
110 |
+
" <th>1</th>\n",
|
111 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
112 |
+
" <td>0</td>\n",
|
113 |
+
" </tr>\n",
|
114 |
+
" <tr>\n",
|
115 |
+
" <th>2</th>\n",
|
116 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
117 |
+
" <td>0</td>\n",
|
118 |
+
" </tr>\n",
|
119 |
+
" <tr>\n",
|
120 |
+
" <th>3</th>\n",
|
121 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
122 |
+
" <td>0</td>\n",
|
123 |
+
" </tr>\n",
|
124 |
+
" <tr>\n",
|
125 |
+
" <th>4</th>\n",
|
126 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
127 |
+
" <td>0</td>\n",
|
128 |
+
" </tr>\n",
|
129 |
+
" <tr>\n",
|
130 |
+
" <th>...</th>\n",
|
131 |
+
" <td>...</td>\n",
|
132 |
+
" <td>...</td>\n",
|
133 |
+
" </tr>\n",
|
134 |
+
" <tr>\n",
|
135 |
+
" <th>41177</th>\n",
|
136 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
137 |
+
" <td>1</td>\n",
|
138 |
+
" </tr>\n",
|
139 |
+
" <tr>\n",
|
140 |
+
" <th>41178</th>\n",
|
141 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
142 |
+
" <td>1</td>\n",
|
143 |
+
" </tr>\n",
|
144 |
+
" <tr>\n",
|
145 |
+
" <th>41179</th>\n",
|
146 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
147 |
+
" <td>1</td>\n",
|
148 |
+
" </tr>\n",
|
149 |
+
" <tr>\n",
|
150 |
+
" <th>41180</th>\n",
|
151 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
152 |
+
" <td>1</td>\n",
|
153 |
+
" </tr>\n",
|
154 |
+
" <tr>\n",
|
155 |
+
" <th>41181</th>\n",
|
156 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
157 |
+
" <td>1</td>\n",
|
158 |
+
" </tr>\n",
|
159 |
+
" </tbody>\n",
|
160 |
+
"</table>\n",
|
161 |
+
"<p>41182 rows × 2 columns</p>\n",
|
162 |
+
"</div>"
|
163 |
+
],
|
164 |
+
"text/plain": [
|
165 |
+
" tweet subtask_a\n",
|
166 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
|
167 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
|
168 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
|
169 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
|
170 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
|
171 |
+
"... ... ...\n",
|
172 |
+
"41177 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
|
173 |
+
"41178 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
|
174 |
+
"41179 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
|
175 |
+
"41180 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
|
176 |
+
"41181 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
|
177 |
+
"\n",
|
178 |
+
"[41182 rows x 2 columns]"
|
179 |
+
]
|
180 |
+
},
|
181 |
+
"execution_count": 6,
|
182 |
+
"metadata": {},
|
183 |
+
"output_type": "execute_result"
|
184 |
+
}
|
185 |
+
],
|
186 |
+
"source": [
|
187 |
+
"train_df"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 7,
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [],
|
195 |
+
"source": [
|
196 |
+
"# train_df=pd.concat([train_df,selin_df], axis=0)"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"cell_type": "code",
|
201 |
+
"execution_count": 8,
|
202 |
+
"metadata": {},
|
203 |
+
"outputs": [
|
204 |
+
{
|
205 |
+
"data": {
|
206 |
+
"text/html": [
|
207 |
+
"<div>\n",
|
208 |
+
"<style scoped>\n",
|
209 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
210 |
+
" vertical-align: middle;\n",
|
211 |
+
" }\n",
|
212 |
+
"\n",
|
213 |
+
" .dataframe tbody tr th {\n",
|
214 |
+
" vertical-align: top;\n",
|
215 |
+
" }\n",
|
216 |
+
"\n",
|
217 |
+
" .dataframe thead th {\n",
|
218 |
+
" text-align: right;\n",
|
219 |
+
" }\n",
|
220 |
+
"</style>\n",
|
221 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
222 |
+
" <thead>\n",
|
223 |
+
" <tr style=\"text-align: right;\">\n",
|
224 |
+
" <th></th>\n",
|
225 |
+
" <th>tweet</th>\n",
|
226 |
+
" <th>subtask_a</th>\n",
|
227 |
+
" </tr>\n",
|
228 |
+
" </thead>\n",
|
229 |
+
" <tbody>\n",
|
230 |
+
" <tr>\n",
|
231 |
+
" <th>0</th>\n",
|
232 |
+
" <td>en güzel uyuyan insan ödülü jeon jungkook'a g...</td>\n",
|
233 |
+
" <td>0</td>\n",
|
234 |
+
" </tr>\n",
|
235 |
+
" <tr>\n",
|
236 |
+
" <th>1</th>\n",
|
237 |
+
" <td>Mekanı cennet olsun, saygılar sayın avukatımı...</td>\n",
|
238 |
+
" <td>0</td>\n",
|
239 |
+
" </tr>\n",
|
240 |
+
" <tr>\n",
|
241 |
+
" <th>2</th>\n",
|
242 |
+
" <td>Kızlar aranızda kas yığını beylere düşenler ol...</td>\n",
|
243 |
+
" <td>0</td>\n",
|
244 |
+
" </tr>\n",
|
245 |
+
" <tr>\n",
|
246 |
+
" <th>3</th>\n",
|
247 |
+
" <td>Biraz ders çalışayım. Tembellik ve uyku düşman...</td>\n",
|
248 |
+
" <td>0</td>\n",
|
249 |
+
" </tr>\n",
|
250 |
+
" <tr>\n",
|
251 |
+
" <th>4</th>\n",
|
252 |
+
" <td>Trezeguet yerine El Sharawy daha iyi olmaz mı</td>\n",
|
253 |
+
" <td>0</td>\n",
|
254 |
+
" </tr>\n",
|
255 |
+
" <tr>\n",
|
256 |
+
" <th>...</th>\n",
|
257 |
+
" <td>...</td>\n",
|
258 |
+
" <td>...</td>\n",
|
259 |
+
" </tr>\n",
|
260 |
+
" <tr>\n",
|
261 |
+
" <th>41177</th>\n",
|
262 |
+
" <td>Hil**adamlar kesinlikle kelimeleri anlamıyorla...</td>\n",
|
263 |
+
" <td>1</td>\n",
|
264 |
+
" </tr>\n",
|
265 |
+
" <tr>\n",
|
266 |
+
" <th>41178</th>\n",
|
267 |
+
" <td>Böyle piçlerin çok erken ölmemelerini ve çok f...</td>\n",
|
268 |
+
" <td>1</td>\n",
|
269 |
+
" </tr>\n",
|
270 |
+
" <tr>\n",
|
271 |
+
" <th>41179</th>\n",
|
272 |
+
" <td>Turgay denilen bu holigonda bir sorun yok, gur...</td>\n",
|
273 |
+
" <td>1</td>\n",
|
274 |
+
" </tr>\n",
|
275 |
+
" <tr>\n",
|
276 |
+
" <th>41180</th>\n",
|
277 |
+
" <td>Umarım ülkenin düşük zekadan kurtulması ilgile...</td>\n",
|
278 |
+
" <td>1</td>\n",
|
279 |
+
" </tr>\n",
|
280 |
+
" <tr>\n",
|
281 |
+
" <th>41181</th>\n",
|
282 |
+
" <td>CHP sandıkları bırakmaz, üzerine oturur, bir c...</td>\n",
|
283 |
+
" <td>1</td>\n",
|
284 |
+
" </tr>\n",
|
285 |
+
" </tbody>\n",
|
286 |
+
"</table>\n",
|
287 |
+
"<p>41182 rows × 2 columns</p>\n",
|
288 |
+
"</div>"
|
289 |
+
],
|
290 |
+
"text/plain": [
|
291 |
+
" tweet subtask_a\n",
|
292 |
+
"0 en güzel uyuyan insan ödülü jeon jungkook'a g... 0\n",
|
293 |
+
"1 Mekanı cennet olsun, saygılar sayın avukatımı... 0\n",
|
294 |
+
"2 Kızlar aranızda kas yığını beylere düşenler ol... 0\n",
|
295 |
+
"3 Biraz ders çalışayım. Tembellik ve uyku düşman... 0\n",
|
296 |
+
"4 Trezeguet yerine El Sharawy daha iyi olmaz mı 0\n",
|
297 |
+
"... ... ...\n",
|
298 |
+
"41177 Hil**adamlar kesinlikle kelimeleri anlamıyorla... 1\n",
|
299 |
+
"41178 Böyle piçlerin çok erken ölmemelerini ve çok f... 1\n",
|
300 |
+
"41179 Turgay denilen bu holigonda bir sorun yok, gur... 1\n",
|
301 |
+
"41180 Umarım ülkenin düşük zekadan kurtulması ilgile... 1\n",
|
302 |
+
"41181 CHP sandıkları bırakmaz, üzerine oturur, bir c... 1\n",
|
303 |
+
"\n",
|
304 |
+
"[41182 rows x 2 columns]"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
"execution_count": 8,
|
308 |
+
"metadata": {},
|
309 |
+
"output_type": "execute_result"
|
310 |
+
}
|
311 |
+
],
|
312 |
+
"source": [
|
313 |
+
"train_df"
|
314 |
+
]
|
315 |
+
},
|
316 |
+
{
|
317 |
+
"cell_type": "code",
|
318 |
+
"execution_count": 9,
|
319 |
+
"metadata": {},
|
320 |
+
"outputs": [],
|
321 |
+
"source": [
|
322 |
+
"train_df = df.sample(frac = 0.7, random_state = 200)\n",
|
323 |
+
"df_2 = df.drop(train_df.index)\n",
|
324 |
+
"test_df = df_2.sample(frac = 0.15, random_state = 200)\n",
|
325 |
+
"val_df = df_2.drop(test_df.index)"
|
326 |
+
]
|
327 |
+
},
|
328 |
+
{
|
329 |
+
"cell_type": "code",
|
330 |
+
"execution_count": 10,
|
331 |
+
"metadata": {},
|
332 |
+
"outputs": [],
|
333 |
+
"source": [
|
334 |
+
"text_train = train_df.tweet.values\n",
|
335 |
+
"label_train = train_df.subtask_a.values"
|
336 |
+
]
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"cell_type": "code",
|
340 |
+
"execution_count": 11,
|
341 |
+
"metadata": {},
|
342 |
+
"outputs": [],
|
343 |
+
"source": [
|
344 |
+
"text_test = test_df.tweet.values\n",
|
345 |
+
"label_test = test_df.subtask_a.values"
|
346 |
+
]
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"cell_type": "code",
|
350 |
+
"execution_count": 12,
|
351 |
+
"metadata": {},
|
352 |
+
"outputs": [],
|
353 |
+
"source": [
|
354 |
+
"text_val = val_df.tweet.values\n",
|
355 |
+
"label_val = val_df.subtask_a.values"
|
356 |
+
]
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"cell_type": "code",
|
360 |
+
"execution_count": 13,
|
361 |
+
"metadata": {},
|
362 |
+
"outputs": [],
|
363 |
+
"source": [
|
364 |
+
"from datasets.dataset_dict import DatasetDict\n",
|
365 |
+
"from datasets import Dataset\n",
|
366 |
+
"dataset={'train':Dataset.from_dict({'label':label_train,'text':text_train}),\n",
|
367 |
+
" 'val':Dataset.from_dict({'label':label_val,'text':text_val}),\n",
|
368 |
+
" 'test':Dataset.from_dict({'label':label_test,'text':text_test})\n",
|
369 |
+
" }\n",
|
370 |
+
"dataset = DatasetDict(dataset)"
|
371 |
+
]
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"cell_type": "code",
|
375 |
+
"execution_count": 14,
|
376 |
+
"metadata": {},
|
377 |
+
"outputs": [],
|
378 |
+
"source": [
|
379 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\")\n",
|
380 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\")\n",
|
381 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\")\n",
|
382 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",max_length=208,padding=\"max_length\",truncation=True,return_tensors=\"pt\",add_special_tokens=True,)\n",
|
383 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"stage_f/pretrain_mlm_distilbert-base-turkish-cased\")\n",
|
384 |
+
"def tokenize_function(examples):\n",
|
385 |
+
" return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)"
|
386 |
+
]
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"cell_type": "code",
|
390 |
+
"execution_count": 15,
|
391 |
+
"metadata": {},
|
392 |
+
"outputs": [
|
393 |
+
{
|
394 |
+
"data": {
|
395 |
+
"application/vnd.jupyter.widget-view+json": {
|
396 |
+
"model_id": "5fba4c9671724e9a93d6ad14a1427345",
|
397 |
+
"version_major": 2,
|
398 |
+
"version_minor": 0
|
399 |
+
},
|
400 |
+
"text/plain": [
|
401 |
+
"Map: 0%| | 0/28827 [00:00<?, ? examples/s]"
|
402 |
+
]
|
403 |
+
},
|
404 |
+
"metadata": {},
|
405 |
+
"output_type": "display_data"
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"data": {
|
409 |
+
"application/vnd.jupyter.widget-view+json": {
|
410 |
+
"model_id": "2fff446f4f094d2fb66da549a49ad8a4",
|
411 |
+
"version_major": 2,
|
412 |
+
"version_minor": 0
|
413 |
+
},
|
414 |
+
"text/plain": [
|
415 |
+
"Map: 0%| | 0/10502 [00:00<?, ? examples/s]"
|
416 |
+
]
|
417 |
+
},
|
418 |
+
"metadata": {},
|
419 |
+
"output_type": "display_data"
|
420 |
+
},
|
421 |
+
{
|
422 |
+
"data": {
|
423 |
+
"application/vnd.jupyter.widget-view+json": {
|
424 |
+
"model_id": "675f3b595b21489abaca01453c06db2c",
|
425 |
+
"version_major": 2,
|
426 |
+
"version_minor": 0
|
427 |
+
},
|
428 |
+
"text/plain": [
|
429 |
+
"Map: 0%| | 0/1853 [00:00<?, ? examples/s]"
|
430 |
+
]
|
431 |
+
},
|
432 |
+
"metadata": {},
|
433 |
+
"output_type": "display_data"
|
434 |
+
}
|
435 |
+
],
|
436 |
+
"source": [
|
437 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)"
|
438 |
+
]
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"cell_type": "code",
|
442 |
+
"execution_count": 16,
|
443 |
+
"metadata": {},
|
444 |
+
"outputs": [],
|
445 |
+
"source": [
|
446 |
+
"small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42)\n",
|
447 |
+
"small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42)"
|
448 |
+
]
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"cell_type": "code",
|
452 |
+
"execution_count": 17,
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [
|
455 |
+
{
|
456 |
+
"data": {
|
457 |
+
"text/plain": [
|
458 |
+
"Dataset({\n",
|
459 |
+
" features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
|
460 |
+
" num_rows: 28827\n",
|
461 |
+
"})"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
"execution_count": 17,
|
465 |
+
"metadata": {},
|
466 |
+
"output_type": "execute_result"
|
467 |
+
}
|
468 |
+
],
|
469 |
+
"source": [
|
470 |
+
"small_train_dataset"
|
471 |
+
]
|
472 |
+
},
|
473 |
+
{
|
474 |
+
"cell_type": "code",
|
475 |
+
"execution_count": 18,
|
476 |
+
"metadata": {},
|
477 |
+
"outputs": [
|
478 |
+
{
|
479 |
+
"data": {
|
480 |
+
"text/plain": [
|
481 |
+
"Dataset({\n",
|
482 |
+
" features: ['label', 'text', 'input_ids', 'attention_mask'],\n",
|
483 |
+
" num_rows: 1853\n",
|
484 |
+
"})"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
"execution_count": 18,
|
488 |
+
"metadata": {},
|
489 |
+
"output_type": "execute_result"
|
490 |
+
}
|
491 |
+
],
|
492 |
+
"source": [
|
493 |
+
"small_eval_dataset"
|
494 |
+
]
|
495 |
+
},
|
496 |
+
{
|
497 |
+
"cell_type": "code",
|
498 |
+
"execution_count": 19,
|
499 |
+
"metadata": {},
|
500 |
+
"outputs": [
|
501 |
+
{
|
502 |
+
"name": "stderr",
|
503 |
+
"output_type": "stream",
|
504 |
+
"text": [
|
505 |
+
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Overfit-GM/distilbert-base-turkish-cased-offensive and are newly initialized because the shapes did not match:\n",
|
506 |
+
"- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated\n",
|
507 |
+
"- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated\n",
|
508 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
509 |
+
]
|
510 |
+
}
|
511 |
+
],
|
512 |
+
"source": [
|
513 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
514 |
+
"\n",
|
515 |
+
"# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/bert-base-turkish-128k-uncased\",num_labels = 2)\n",
|
516 |
+
"# model = AutoModelForSequenceClassification.from_pretrained(\"dbmdz/distilbert-base-turkish-cased\",num_labels = 2)\n",
|
517 |
+
"# model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)\n",
|
518 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"Overfit-GM/distilbert-base-turkish-cased-offensive\",num_labels = 2, ignore_mismatched_sizes=True)"
|
519 |
+
]
|
520 |
+
},
|
521 |
+
{
|
522 |
+
"cell_type": "code",
|
523 |
+
"execution_count": 20,
|
524 |
+
"metadata": {},
|
525 |
+
"outputs": [],
|
526 |
+
"source": [
|
527 |
+
"from transformers import TrainingArguments\n",
|
528 |
+
"\n",
|
529 |
+
"training_args = TrainingArguments(output_dir=\"test_trainer\")"
|
530 |
+
]
|
531 |
+
},
|
532 |
+
{
|
533 |
+
"cell_type": "code",
|
534 |
+
"execution_count": 21,
|
535 |
+
"metadata": {},
|
536 |
+
"outputs": [],
|
537 |
+
"source": [
|
538 |
+
"# import numpy as np\n",
|
539 |
+
"# import evaluate\n",
|
540 |
+
"\n",
|
541 |
+
"# # metric = evaluate.load(\"accuracy\")\n",
|
542 |
+
"# # confusion_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")\n",
|
543 |
+
"# # metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\", \"confusion_matrix\"])\n",
|
544 |
+
"# metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])"
|
545 |
+
]
|
546 |
+
},
|
547 |
+
{
|
548 |
+
"cell_type": "code",
|
549 |
+
"execution_count": 22,
|
550 |
+
"metadata": {},
|
551 |
+
"outputs": [],
|
552 |
+
"source": [
|
553 |
+
"import numpy as np\n",
|
554 |
+
"import evaluate\n",
|
555 |
+
"\n",
|
556 |
+
"metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
|
557 |
+
"conf_matrix = evaluate.load(\"BucketHeadP65/confusion_matrix\")"
|
558 |
+
]
|
559 |
+
},
|
560 |
+
{
|
561 |
+
"cell_type": "code",
|
562 |
+
"execution_count": 23,
|
563 |
+
"metadata": {},
|
564 |
+
"outputs": [],
|
565 |
+
"source": [
|
566 |
+
"def compute_metrics(eval_pred):\n",
|
567 |
+
" logits, labels = eval_pred\n",
|
568 |
+
" predictions = np.argmax(logits, axis=-1)\n",
|
569 |
+
" print(conf_matrix.compute(predictions=predictions, references=labels))\n",
|
570 |
+
" return metric.compute(predictions=predictions, references=labels)"
|
571 |
+
]
|
572 |
+
},
|
573 |
+
{
|
574 |
+
"cell_type": "code",
|
575 |
+
"execution_count": 24,
|
576 |
+
"metadata": {},
|
577 |
+
"outputs": [],
|
578 |
+
"source": [
|
579 |
+
"from transformers import TrainingArguments, Trainer\n",
|
580 |
+
"from pytorch_lightning.loggers import TensorBoardLogger,MLFlowLogger\n",
|
581 |
+
"\n",
|
582 |
+
"training_args = TrainingArguments(output_dir=\"test_trainer\", evaluation_strategy=\"epoch\", num_train_epochs = 5, logging_dir ='TensorBoard',report_to ='mlflow')"
|
583 |
+
]
|
584 |
+
},
|
585 |
+
{
|
586 |
+
"cell_type": "code",
|
587 |
+
"execution_count": 25,
|
588 |
+
"metadata": {},
|
589 |
+
"outputs": [],
|
590 |
+
"source": [
|
591 |
+
"trainer = Trainer(\n",
|
592 |
+
" model=model,\n",
|
593 |
+
" args=training_args,\n",
|
594 |
+
" train_dataset=small_train_dataset,\n",
|
595 |
+
" eval_dataset=small_eval_dataset,\n",
|
596 |
+
" compute_metrics=compute_metrics,\n",
|
597 |
+
")"
|
598 |
+
]
|
599 |
+
},
|
600 |
+
{
|
601 |
+
"cell_type": "code",
|
602 |
+
"execution_count": 26,
|
603 |
+
"metadata": {},
|
604 |
+
"outputs": [
|
605 |
+
{
|
606 |
+
"name": "stderr",
|
607 |
+
"output_type": "stream",
|
608 |
+
"text": [
|
609 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
610 |
+
"To disable this warning, you can either:\n",
|
611 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
612 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
613 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
614 |
+
"To disable this warning, you can either:\n",
|
615 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
616 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
617 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
618 |
+
"To disable this warning, you can either:\n",
|
619 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
620 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
621 |
+
]
|
622 |
+
},
|
623 |
+
{
|
624 |
+
"data": {
|
625 |
+
"application/vnd.jupyter.widget-view+json": {
|
626 |
+
"model_id": "a38121a009be4a0f90e30fc9c0cf49ed",
|
627 |
+
"version_major": 2,
|
628 |
+
"version_minor": 0
|
629 |
+
},
|
630 |
+
"text/plain": [
|
631 |
+
" 0%| | 0/18020 [00:00<?, ?it/s]"
|
632 |
+
]
|
633 |
+
},
|
634 |
+
"metadata": {},
|
635 |
+
"output_type": "display_data"
|
636 |
+
},
|
637 |
+
{
|
638 |
+
"name": "stdout",
|
639 |
+
"output_type": "stream",
|
640 |
+
"text": [
|
641 |
+
"{'loss': 0.4638, 'learning_rate': 4.86126526082131e-05, 'epoch': 0.14}\n",
|
642 |
+
"{'loss': 0.3886, 'learning_rate': 4.72253052164262e-05, 'epoch': 0.28}\n",
|
643 |
+
"{'loss': 0.3893, 'learning_rate': 4.583795782463929e-05, 'epoch': 0.42}\n",
|
644 |
+
"{'loss': 0.3594, 'learning_rate': 4.445061043285239e-05, 'epoch': 0.55}\n",
|
645 |
+
"{'loss': 0.3547, 'learning_rate': 4.306326304106548e-05, 'epoch': 0.69}\n",
|
646 |
+
"{'loss': 0.3384, 'learning_rate': 4.167591564927858e-05, 'epoch': 0.83}\n",
|
647 |
+
"{'loss': 0.3498, 'learning_rate': 4.028856825749168e-05, 'epoch': 0.97}\n"
|
648 |
+
]
|
649 |
+
},
|
650 |
+
{
|
651 |
+
"data": {
|
652 |
+
"application/vnd.jupyter.widget-view+json": {
|
653 |
+
"model_id": "94ab139e1ebb482da2111517ad5a3a78",
|
654 |
+
"version_major": 2,
|
655 |
+
"version_minor": 0
|
656 |
+
},
|
657 |
+
"text/plain": [
|
658 |
+
" 0%| | 0/232 [00:00<?, ?it/s]"
|
659 |
+
]
|
660 |
+
},
|
661 |
+
"metadata": {},
|
662 |
+
"output_type": "display_data"
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"name": "stdout",
|
666 |
+
"output_type": "stream",
|
667 |
+
"text": [
|
668 |
+
"{'confusion_matrix': array([[966, 90],\n",
|
669 |
+
" [118, 679]])}\n",
|
670 |
+
"{'eval_loss': 0.28741681575775146, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8671775223499362, 'eval_precision': 0.88296488946684, 'eval_recall': 0.8519447929736512, 'eval_runtime': 11.4928, 'eval_samples_per_second': 161.231, 'eval_steps_per_second': 20.186, 'epoch': 1.0}\n",
|
671 |
+
"{'loss': 0.2449, 'learning_rate': 3.890122086570477e-05, 'epoch': 1.11}\n",
|
672 |
+
"{'loss': 0.2178, 'learning_rate': 3.751387347391787e-05, 'epoch': 1.25}\n",
|
673 |
+
"{'loss': 0.2431, 'learning_rate': 3.612652608213097e-05, 'epoch': 1.39}\n",
|
674 |
+
"{'loss': 0.2261, 'learning_rate': 3.4739178690344064e-05, 'epoch': 1.53}\n",
|
675 |
+
"{'loss': 0.2365, 'learning_rate': 3.3351831298557165e-05, 'epoch': 1.66}\n",
|
676 |
+
"{'loss': 0.2169, 'learning_rate': 3.196448390677026e-05, 'epoch': 1.8}\n",
|
677 |
+
"{'loss': 0.222, 'learning_rate': 3.0577136514983354e-05, 'epoch': 1.94}\n"
|
678 |
+
]
|
679 |
+
},
|
680 |
+
{
|
681 |
+
"data": {
|
682 |
+
"application/vnd.jupyter.widget-view+json": {
|
683 |
+
"model_id": "063c47c6cae0467194d4c0827e67c277",
|
684 |
+
"version_major": 2,
|
685 |
+
"version_minor": 0
|
686 |
+
},
|
687 |
+
"text/plain": [
|
688 |
+
" 0%| | 0/232 [00:00<?, ?it/s]"
|
689 |
+
]
|
690 |
+
},
|
691 |
+
"metadata": {},
|
692 |
+
"output_type": "display_data"
|
693 |
+
},
|
694 |
+
{
|
695 |
+
"name": "stdout",
|
696 |
+
"output_type": "stream",
|
697 |
+
"text": [
|
698 |
+
"{'confusion_matrix': array([[900, 156],\n",
|
699 |
+
" [ 76, 721]])}\n",
|
700 |
+
"{'eval_loss': 0.47509443759918213, 'eval_accuracy': 0.8747976254722072, 'eval_f1': 0.8614097968936678, 'eval_precision': 0.82212086659065, 'eval_recall': 0.904642409033877, 'eval_runtime': 11.6203, 'eval_samples_per_second': 159.462, 'eval_steps_per_second': 19.965, 'epoch': 2.0}\n",
|
701 |
+
"{'loss': 0.146, 'learning_rate': 2.918978912319645e-05, 'epoch': 2.08}\n",
|
702 |
+
"{'loss': 0.1163, 'learning_rate': 2.7802441731409544e-05, 'epoch': 2.22}\n",
|
703 |
+
"{'loss': 0.1008, 'learning_rate': 2.641509433962264e-05, 'epoch': 2.36}\n",
|
704 |
+
"{'loss': 0.0967, 'learning_rate': 2.502774694783574e-05, 'epoch': 2.5}\n",
|
705 |
+
"{'loss': 0.1456, 'learning_rate': 2.3640399556048838e-05, 'epoch': 2.64}\n",
|
706 |
+
"{'loss': 0.1178, 'learning_rate': 2.2253052164261932e-05, 'epoch': 2.77}\n",
|
707 |
+
"{'loss': 0.1155, 'learning_rate': 2.0865704772475027e-05, 'epoch': 2.91}\n"
|
708 |
+
]
|
709 |
+
},
|
710 |
+
{
|
711 |
+
"data": {
|
712 |
+
"application/vnd.jupyter.widget-view+json": {
|
713 |
+
"model_id": "4fa52dfbbae54cde8c627a237bed51bc",
|
714 |
+
"version_major": 2,
|
715 |
+
"version_minor": 0
|
716 |
+
},
|
717 |
+
"text/plain": [
|
718 |
+
" 0%| | 0/232 [00:00<?, ?it/s]"
|
719 |
+
]
|
720 |
+
},
|
721 |
+
"metadata": {},
|
722 |
+
"output_type": "display_data"
|
723 |
+
},
|
724 |
+
{
|
725 |
+
"name": "stdout",
|
726 |
+
"output_type": "stream",
|
727 |
+
"text": [
|
728 |
+
"{'confusion_matrix': array([[954, 102],\n",
|
729 |
+
" [106, 691]])}\n",
|
730 |
+
"{'eval_loss': 0.5530020594596863, 'eval_accuracy': 0.8877495952509444, 'eval_f1': 0.8691823899371071, 'eval_precision': 0.8713745271122321, 'eval_recall': 0.8670012547051443, 'eval_runtime': 11.6026, 'eval_samples_per_second': 159.706, 'eval_steps_per_second': 19.996, 'epoch': 3.0}\n",
|
731 |
+
"{'loss': 0.0879, 'learning_rate': 1.9478357380688125e-05, 'epoch': 3.05}\n",
|
732 |
+
"{'loss': 0.0351, 'learning_rate': 1.8091009988901223e-05, 'epoch': 3.19}\n",
|
733 |
+
"{'loss': 0.0501, 'learning_rate': 1.670366259711432e-05, 'epoch': 3.33}\n",
|
734 |
+
"{'loss': 0.0425, 'learning_rate': 1.5316315205327412e-05, 'epoch': 3.47}\n",
|
735 |
+
"{'loss': 0.0564, 'learning_rate': 1.392896781354051e-05, 'epoch': 3.61}\n",
|
736 |
+
"{'loss': 0.05, 'learning_rate': 1.2541620421753608e-05, 'epoch': 3.75}\n",
|
737 |
+
"{'loss': 0.034, 'learning_rate': 1.1154273029966705e-05, 'epoch': 3.88}\n"
|
738 |
+
]
|
739 |
+
},
|
740 |
+
{
|
741 |
+
"data": {
|
742 |
+
"application/vnd.jupyter.widget-view+json": {
|
743 |
+
"model_id": "a9b754cd0e7641cb8d8023f28bc32a06",
|
744 |
+
"version_major": 2,
|
745 |
+
"version_minor": 0
|
746 |
+
},
|
747 |
+
"text/plain": [
|
748 |
+
" 0%| | 0/232 [00:00<?, ?it/s]"
|
749 |
+
]
|
750 |
+
},
|
751 |
+
"metadata": {},
|
752 |
+
"output_type": "display_data"
|
753 |
+
},
|
754 |
+
{
|
755 |
+
"name": "stdout",
|
756 |
+
"output_type": "stream",
|
757 |
+
"text": [
|
758 |
+
"{'confusion_matrix': array([[966, 90],\n",
|
759 |
+
" [109, 688]])}\n",
|
760 |
+
"{'eval_loss': 0.824292778968811, 'eval_accuracy': 0.8926065839179709, 'eval_f1': 0.8736507936507937, 'eval_precision': 0.884318766066838, 'eval_recall': 0.863237139272271, 'eval_runtime': 11.6185, 'eval_samples_per_second': 159.487, 'eval_steps_per_second': 19.968, 'epoch': 4.0}\n",
|
761 |
+
"{'loss': 0.0354, 'learning_rate': 9.766925638179801e-06, 'epoch': 4.02}\n",
|
762 |
+
"{'loss': 0.0165, 'learning_rate': 8.379578246392897e-06, 'epoch': 4.16}\n",
|
763 |
+
"{'loss': 0.0119, 'learning_rate': 6.992230854605994e-06, 'epoch': 4.3}\n",
|
764 |
+
"{'loss': 0.0145, 'learning_rate': 5.60488346281909e-06, 'epoch': 4.44}\n",
|
765 |
+
"{'loss': 0.0169, 'learning_rate': 4.217536071032187e-06, 'epoch': 4.58}\n",
|
766 |
+
"{'loss': 0.0132, 'learning_rate': 2.830188679245283e-06, 'epoch': 4.72}\n",
|
767 |
+
"{'loss': 0.0232, 'learning_rate': 1.4428412874583796e-06, 'epoch': 4.86}\n",
|
768 |
+
"{'loss': 0.0189, 'learning_rate': 5.549389567147614e-08, 'epoch': 4.99}\n"
|
769 |
+
]
|
770 |
+
},
|
771 |
+
{
|
772 |
+
"data": {
|
773 |
+
"application/vnd.jupyter.widget-view+json": {
|
774 |
+
"model_id": "e66e5b59c6ba42ae9939f55dcda3c877",
|
775 |
+
"version_major": 2,
|
776 |
+
"version_minor": 0
|
777 |
+
},
|
778 |
+
"text/plain": [
|
779 |
+
" 0%| | 0/232 [00:00<?, ?it/s]"
|
780 |
+
]
|
781 |
+
},
|
782 |
+
"metadata": {},
|
783 |
+
"output_type": "display_data"
|
784 |
+
},
|
785 |
+
{
|
786 |
+
"name": "stdout",
|
787 |
+
"output_type": "stream",
|
788 |
+
"text": [
|
789 |
+
"{'confusion_matrix': array([[955, 101],\n",
|
790 |
+
" [111, 686]])}\n",
|
791 |
+
"{'eval_loss': 0.937654972076416, 'eval_accuracy': 0.8855909336211549, 'eval_f1': 0.8661616161616161, 'eval_precision': 0.8716645489199492, 'eval_recall': 0.8607277289836889, 'eval_runtime': 11.5644, 'eval_samples_per_second': 160.233, 'eval_steps_per_second': 20.062, 'epoch': 5.0}\n",
|
792 |
+
"{'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0}\n"
|
793 |
+
]
|
794 |
+
},
|
795 |
+
{
|
796 |
+
"data": {
|
797 |
+
"text/plain": [
|
798 |
+
"TrainOutput(global_step=18020, training_loss=0.15528733040680712, metrics={'train_runtime': 3027.4521, 'train_samples_per_second': 47.609, 'train_steps_per_second': 5.952, 'train_loss': 0.15528733040680712, 'epoch': 5.0})"
|
799 |
+
]
|
800 |
+
},
|
801 |
+
"execution_count": 26,
|
802 |
+
"metadata": {},
|
803 |
+
"output_type": "execute_result"
|
804 |
+
}
|
805 |
+
],
|
806 |
+
"source": [
|
807 |
+
"trainer.train()"
|
808 |
+
]
|
809 |
+
},
|
810 |
+
{
|
811 |
+
"cell_type": "markdown",
|
812 |
+
"metadata": {},
|
813 |
+
"source": [
|
814 |
+
"# best case"
|
815 |
+
]
|
816 |
+
},
|
817 |
+
{
|
818 |
+
"cell_type": "code",
|
819 |
+
"execution_count": 25,
|
820 |
+
"metadata": {},
|
821 |
+
"outputs": [
|
822 |
+
{
|
823 |
+
"name": "stderr",
|
824 |
+
"output_type": "stream",
|
825 |
+
"text": [
|
826 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
827 |
+
"To disable this warning, you can either:\n",
|
828 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
829 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
830 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
831 |
+
"To disable this warning, you can either:\n",
|
832 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
833 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
834 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
835 |
+
"To disable this warning, you can either:\n",
|
836 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
837 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
838 |
+
]
|
839 |
+
},
|
840 |
+
{
|
841 |
+
"data": {
|
842 |
+
"application/vnd.jupyter.widget-view+json": {
|
843 |
+
"model_id": "4620503cb22c41a582c44a3d17fac2f6",
|
844 |
+
"version_major": 2,
|
845 |
+
"version_minor": 0
|
846 |
+
},
|
847 |
+
"text/plain": [
|
848 |
+
" 0%| | 0/18825 [00:00<?, ?it/s]"
|
849 |
+
]
|
850 |
+
},
|
851 |
+
"metadata": {},
|
852 |
+
"output_type": "display_data"
|
853 |
+
},
|
854 |
+
{
|
855 |
+
"name": "stdout",
|
856 |
+
"output_type": "stream",
|
857 |
+
"text": [
|
858 |
+
"{'loss': 0.4623, 'learning_rate': 4.867197875166003e-05, 'epoch': 0.13}\n",
|
859 |
+
"{'loss': 0.3955, 'learning_rate': 4.734395750332006e-05, 'epoch': 0.27}\n",
|
860 |
+
"{'loss': 0.3695, 'learning_rate': 4.601593625498008e-05, 'epoch': 0.4}\n",
|
861 |
+
"{'loss': 0.368, 'learning_rate': 4.4687915006640105e-05, 'epoch': 0.53}\n",
|
862 |
+
"{'loss': 0.3418, 'learning_rate': 4.335989375830013e-05, 'epoch': 0.66}\n",
|
863 |
+
"{'loss': 0.3519, 'learning_rate': 4.203187250996016e-05, 'epoch': 0.8}\n",
|
864 |
+
"{'loss': 0.3418, 'learning_rate': 4.070385126162019e-05, 'epoch': 0.93}\n"
|
865 |
+
]
|
866 |
+
},
|
867 |
+
{
|
868 |
+
"data": {
|
869 |
+
"application/vnd.jupyter.widget-view+json": {
|
870 |
+
"model_id": "c81779b9a7eb43cfa29966957f13ec31",
|
871 |
+
"version_major": 2,
|
872 |
+
"version_minor": 0
|
873 |
+
},
|
874 |
+
"text/plain": [
|
875 |
+
" 0%| | 0/242 [00:00<?, ?it/s]"
|
876 |
+
]
|
877 |
+
},
|
878 |
+
"metadata": {},
|
879 |
+
"output_type": "display_data"
|
880 |
+
},
|
881 |
+
{
|
882 |
+
"name": "stdout",
|
883 |
+
"output_type": "stream",
|
884 |
+
"text": [
|
885 |
+
"{'eval_loss': 0.2548353374004364, 'eval_accuracy': 0.9013429752066116, 'eval_f1': 0.8737607402511566, 'eval_precision': 0.9218967921896792, 'eval_recall': 0.8304020100502513, 'eval_runtime': 12.1488, 'eval_samples_per_second': 159.357, 'eval_steps_per_second': 19.92, 'epoch': 1.0}\n",
|
886 |
+
"{'loss': 0.2884, 'learning_rate': 3.9375830013280215e-05, 'epoch': 1.06}\n",
|
887 |
+
"{'loss': 0.2136, 'learning_rate': 3.804780876494024e-05, 'epoch': 1.2}\n",
|
888 |
+
"{'loss': 0.2422, 'learning_rate': 3.671978751660027e-05, 'epoch': 1.33}\n",
|
889 |
+
"{'loss': 0.2105, 'learning_rate': 3.53917662682603e-05, 'epoch': 1.46}\n",
|
890 |
+
"{'loss': 0.2203, 'learning_rate': 3.406374501992032e-05, 'epoch': 1.59}\n",
|
891 |
+
"{'loss': 0.2455, 'learning_rate': 3.2735723771580345e-05, 'epoch': 1.73}\n",
|
892 |
+
"{'loss': 0.2282, 'learning_rate': 3.140770252324037e-05, 'epoch': 1.86}\n",
|
893 |
+
"{'loss': 0.2328, 'learning_rate': 3.00796812749004e-05, 'epoch': 1.99}\n"
|
894 |
+
]
|
895 |
+
},
|
896 |
+
{
|
897 |
+
"data": {
|
898 |
+
"application/vnd.jupyter.widget-view+json": {
|
899 |
+
"model_id": "f83c5030d5c34216ba6422f2c22858ba",
|
900 |
+
"version_major": 2,
|
901 |
+
"version_minor": 0
|
902 |
+
},
|
903 |
+
"text/plain": [
|
904 |
+
" 0%| | 0/242 [00:00<?, ?it/s]"
|
905 |
+
]
|
906 |
+
},
|
907 |
+
"metadata": {},
|
908 |
+
"output_type": "display_data"
|
909 |
+
},
|
910 |
+
{
|
911 |
+
"name": "stdout",
|
912 |
+
"output_type": "stream",
|
913 |
+
"text": [
|
914 |
+
"{'eval_loss': 0.4118729829788208, 'eval_accuracy': 0.8982438016528925, 'eval_f1': 0.8763339610797238, 'eval_precision': 0.875784190715182, 'eval_recall': 0.8768844221105527, 'eval_runtime': 12.1691, 'eval_samples_per_second': 159.092, 'eval_steps_per_second': 19.886, 'epoch': 2.0}\n",
|
915 |
+
"{'loss': 0.1086, 'learning_rate': 2.8751660026560427e-05, 'epoch': 2.12}\n",
|
916 |
+
"{'loss': 0.1137, 'learning_rate': 2.742363877822045e-05, 'epoch': 2.26}\n",
|
917 |
+
"{'loss': 0.1058, 'learning_rate': 2.609561752988048e-05, 'epoch': 2.39}\n",
|
918 |
+
"{'loss': 0.1073, 'learning_rate': 2.4767596281540506e-05, 'epoch': 2.52}\n",
|
919 |
+
"{'loss': 0.0953, 'learning_rate': 2.3439575033200534e-05, 'epoch': 2.66}\n",
|
920 |
+
"{'loss': 0.1066, 'learning_rate': 2.2111553784860558e-05, 'epoch': 2.79}\n",
|
921 |
+
"{'loss': 0.1152, 'learning_rate': 2.0783532536520585e-05, 'epoch': 2.92}\n"
|
922 |
+
]
|
923 |
+
},
|
924 |
+
{
|
925 |
+
"data": {
|
926 |
+
"application/vnd.jupyter.widget-view+json": {
|
927 |
+
"model_id": "3c4d464cb3a340d4aa4f6a1a8e4d95b9",
|
928 |
+
"version_major": 2,
|
929 |
+
"version_minor": 0
|
930 |
+
},
|
931 |
+
"text/plain": [
|
932 |
+
" 0%| | 0/242 [00:00<?, ?it/s]"
|
933 |
+
]
|
934 |
+
},
|
935 |
+
"metadata": {},
|
936 |
+
"output_type": "display_data"
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"name": "stdout",
|
940 |
+
"output_type": "stream",
|
941 |
+
"text": [
|
942 |
+
"{'eval_loss': 0.4992543160915375, 'eval_accuracy': 0.9039256198347108, 'eval_f1': 0.8831658291457286, 'eval_precision': 0.8831658291457286, 'eval_recall': 0.8831658291457286, 'eval_runtime': 12.145, 'eval_samples_per_second': 159.407, 'eval_steps_per_second': 19.926, 'epoch': 3.0}\n",
|
943 |
+
"{'loss': 0.0761, 'learning_rate': 1.9455511288180613e-05, 'epoch': 3.05}\n",
|
944 |
+
"{'loss': 0.0434, 'learning_rate': 1.812749003984064e-05, 'epoch': 3.19}\n",
|
945 |
+
"{'loss': 0.0395, 'learning_rate': 1.6799468791500664e-05, 'epoch': 3.32}\n",
|
946 |
+
"{'loss': 0.0516, 'learning_rate': 1.547144754316069e-05, 'epoch': 3.45}\n",
|
947 |
+
"{'loss': 0.0344, 'learning_rate': 1.4143426294820719e-05, 'epoch': 3.59}\n",
|
948 |
+
"{'loss': 0.0588, 'learning_rate': 1.2815405046480745e-05, 'epoch': 3.72}\n",
|
949 |
+
"{'loss': 0.0323, 'learning_rate': 1.148738379814077e-05, 'epoch': 3.85}\n",
|
950 |
+
"{'loss': 0.0574, 'learning_rate': 1.0159362549800798e-05, 'epoch': 3.98}\n"
|
951 |
+
]
|
952 |
+
},
|
953 |
+
{
|
954 |
+
"data": {
|
955 |
+
"application/vnd.jupyter.widget-view+json": {
|
956 |
+
"model_id": "bf0675bd947c472bb221d755dc55a219",
|
957 |
+
"version_major": 2,
|
958 |
+
"version_minor": 0
|
959 |
+
},
|
960 |
+
"text/plain": [
|
961 |
+
" 0%| | 0/242 [00:00<?, ?it/s]"
|
962 |
+
]
|
963 |
+
},
|
964 |
+
"metadata": {},
|
965 |
+
"output_type": "display_data"
|
966 |
+
},
|
967 |
+
{
|
968 |
+
"name": "stdout",
|
969 |
+
"output_type": "stream",
|
970 |
+
"text": [
|
971 |
+
"{'eval_loss': 0.6084339618682861, 'eval_accuracy': 0.9121900826446281, 'eval_f1': 0.8933500627352573, 'eval_precision': 0.8922305764411027, 'eval_recall': 0.8944723618090452, 'eval_runtime': 11.9875, 'eval_samples_per_second': 161.502, 'eval_steps_per_second': 20.188, 'epoch': 4.0}\n",
|
972 |
+
"{'loss': 0.0175, 'learning_rate': 8.831341301460823e-06, 'epoch': 4.12}\n",
|
973 |
+
"{'loss': 0.0248, 'learning_rate': 7.503320053120851e-06, 'epoch': 4.25}\n",
|
974 |
+
"{'loss': 0.0212, 'learning_rate': 6.175298804780877e-06, 'epoch': 4.38}\n",
|
975 |
+
"{'loss': 0.0215, 'learning_rate': 4.847277556440903e-06, 'epoch': 4.52}\n",
|
976 |
+
"{'loss': 0.0216, 'learning_rate': 3.51925630810093e-06, 'epoch': 4.65}\n",
|
977 |
+
"{'loss': 0.0169, 'learning_rate': 2.1912350597609563e-06, 'epoch': 4.78}\n",
|
978 |
+
"{'loss': 0.0199, 'learning_rate': 8.632138114209828e-07, 'epoch': 4.91}\n"
|
979 |
+
]
|
980 |
+
},
|
981 |
+
{
|
982 |
+
"data": {
|
983 |
+
"application/vnd.jupyter.widget-view+json": {
|
984 |
+
"model_id": "0ac0cee28031479d9721321ec9c949a4",
|
985 |
+
"version_major": 2,
|
986 |
+
"version_minor": 0
|
987 |
+
},
|
988 |
+
"text/plain": [
|
989 |
+
" 0%| | 0/242 [00:00<?, ?it/s]"
|
990 |
+
]
|
991 |
+
},
|
992 |
+
"metadata": {},
|
993 |
+
"output_type": "display_data"
|
994 |
+
},
|
995 |
+
{
|
996 |
+
"name": "stdout",
|
997 |
+
"output_type": "stream",
|
998 |
+
"text": [
|
999 |
+
"{'eval_loss': 0.6909418106079102, 'eval_accuracy': 0.9158057851239669, 'eval_f1': 0.8963763509218055, 'eval_precision': 0.9073359073359073, 'eval_recall': 0.885678391959799, 'eval_runtime': 12.1798, 'eval_samples_per_second': 158.952, 'eval_steps_per_second': 19.869, 'epoch': 5.0}\n",
|
1000 |
+
"{'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0}\n"
|
1001 |
+
]
|
1002 |
+
},
|
1003 |
+
{
|
1004 |
+
"data": {
|
1005 |
+
"text/plain": [
|
1006 |
+
"TrainOutput(global_step=18825, training_loss=0.15457879885892628, metrics={'train_runtime': 3197.4084, 'train_samples_per_second': 47.101, 'train_steps_per_second': 5.888, 'train_loss': 0.15457879885892628, 'epoch': 5.0})"
|
1007 |
+
]
|
1008 |
+
},
|
1009 |
+
"execution_count": 25,
|
1010 |
+
"metadata": {},
|
1011 |
+
"output_type": "execute_result"
|
1012 |
+
}
|
1013 |
+
],
|
1014 |
+
"source": [
|
1015 |
+
"trainer.train()"
|
1016 |
+
]
|
1017 |
+
},
|
1018 |
+
{
|
1019 |
+
"cell_type": "markdown",
|
1020 |
+
"metadata": {},
|
1021 |
+
"source": [
|
1022 |
+
"# load model"
|
1023 |
+
]
|
1024 |
+
},
|
1025 |
+
{
|
1026 |
+
"cell_type": "code",
|
1027 |
+
"execution_count": 20,
|
1028 |
+
"metadata": {},
|
1029 |
+
"outputs": [],
|
1030 |
+
"source": [
|
1031 |
+
"import torch"
|
1032 |
+
]
|
1033 |
+
},
|
1034 |
+
{
|
1035 |
+
"cell_type": "code",
|
1036 |
+
"execution_count": 21,
|
1037 |
+
"metadata": {},
|
1038 |
+
"outputs": [],
|
1039 |
+
"source": [
|
1040 |
+
"model = AutoModelForSequenceClassification.from_pretrained('/DATA/sin-kaf/test_trainer/checkpoint-16000')"
|
1041 |
+
]
|
1042 |
+
},
|
1043 |
+
{
|
1044 |
+
"cell_type": "code",
|
1045 |
+
"execution_count": 30,
|
1046 |
+
"metadata": {},
|
1047 |
+
"outputs": [
|
1048 |
+
{
|
1049 |
+
"name": "stdout",
|
1050 |
+
"output_type": "stream",
|
1051 |
+
"text": [
|
1052 |
+
"Original: güzel kızz\n",
|
1053 |
+
"Token IDs: tensor([[ 2, 2639, 2889, 1050, 3, 0, 0, 0, 0, 0, 0, 0,\n",
|
1054 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1055 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1056 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1057 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1058 |
+
" 0, 0, 0, 0]])\n",
|
1059 |
+
"Token IDs: tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1060 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
1061 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n"
|
1062 |
+
]
|
1063 |
+
}
|
1064 |
+
],
|
1065 |
+
"source": [
|
1066 |
+
"sent = 'güzel kızz'\n",
|
1067 |
+
"input_ids = []\n",
|
1068 |
+
"attention_masks = []\n",
|
1069 |
+
"\n",
|
1070 |
+
"encoded_dict = tokenizer.encode_plus(\n",
|
1071 |
+
" sent,\n",
|
1072 |
+
" add_special_tokens = True,\n",
|
1073 |
+
" max_length = 64,\n",
|
1074 |
+
" pad_to_max_length = True,\n",
|
1075 |
+
" return_attention_mask = True,\n",
|
1076 |
+
" return_tensors = 'pt',\n",
|
1077 |
+
" )\n",
|
1078 |
+
"\n",
|
1079 |
+
"\n",
|
1080 |
+
"input_ids = encoded_dict['input_ids']\n",
|
1081 |
+
"attention_masks = encoded_dict['attention_mask']\n",
|
1082 |
+
"\n",
|
1083 |
+
"\n",
|
1084 |
+
"input_ids = torch.cat([input_ids], dim=0)\n",
|
1085 |
+
"input_mask = torch.cat([attention_masks], dim=0)\n",
|
1086 |
+
"\n",
|
1087 |
+
"\n",
|
1088 |
+
"\n",
|
1089 |
+
"print('Original: ', sent)\n",
|
1090 |
+
"print('Token IDs:', input_ids)\n",
|
1091 |
+
"print('Token IDs:', input_mask)"
|
1092 |
+
]
|
1093 |
+
},
|
1094 |
+
{
|
1095 |
+
"cell_type": "code",
|
1096 |
+
"execution_count": 31,
|
1097 |
+
"metadata": {},
|
1098 |
+
"outputs": [],
|
1099 |
+
"source": [
|
1100 |
+
"outputs = model(input_ids, input_mask)"
|
1101 |
+
]
|
1102 |
+
},
|
1103 |
+
{
|
1104 |
+
"cell_type": "code",
|
1105 |
+
"execution_count": 32,
|
1106 |
+
"metadata": {},
|
1107 |
+
"outputs": [
|
1108 |
+
{
|
1109 |
+
"data": {
|
1110 |
+
"text/plain": [
|
1111 |
+
"SequenceClassifierOutput(loss=None, logits=tensor([[ 3.6835, -3.6147]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
|
1112 |
+
]
|
1113 |
+
},
|
1114 |
+
"execution_count": 32,
|
1115 |
+
"metadata": {},
|
1116 |
+
"output_type": "execute_result"
|
1117 |
+
}
|
1118 |
+
],
|
1119 |
+
"source": [
|
1120 |
+
"outputs"
|
1121 |
+
]
|
1122 |
+
},
|
1123 |
+
{
|
1124 |
+
"cell_type": "code",
|
1125 |
+
"execution_count": 33,
|
1126 |
+
"metadata": {},
|
1127 |
+
"outputs": [
|
1128 |
+
{
|
1129 |
+
"data": {
|
1130 |
+
"text/plain": [
|
1131 |
+
"tensor(0)"
|
1132 |
+
]
|
1133 |
+
},
|
1134 |
+
"execution_count": 33,
|
1135 |
+
"metadata": {},
|
1136 |
+
"output_type": "execute_result"
|
1137 |
+
}
|
1138 |
+
],
|
1139 |
+
"source": [
|
1140 |
+
"torch.argmax(outputs['logits'])"
|
1141 |
+
]
|
1142 |
+
}
|
1143 |
+
],
|
1144 |
+
"metadata": {
|
1145 |
+
"kernelspec": {
|
1146 |
+
"display_name": "dlenv",
|
1147 |
+
"language": "python",
|
1148 |
+
"name": "python3"
|
1149 |
+
},
|
1150 |
+
"language_info": {
|
1151 |
+
"codemirror_mode": {
|
1152 |
+
"name": "ipython",
|
1153 |
+
"version": 3
|
1154 |
+
},
|
1155 |
+
"file_extension": ".py",
|
1156 |
+
"mimetype": "text/x-python",
|
1157 |
+
"name": "python",
|
1158 |
+
"nbconvert_exporter": "python",
|
1159 |
+
"pygments_lexer": "ipython3",
|
1160 |
+
"version": "3.9.17"
|
1161 |
+
}
|
1162 |
+
},
|
1163 |
+
"nbformat": 4,
|
1164 |
+
"nbformat_minor": 2
|
1165 |
+
}
|