akoksal committed
Commit 14640bc
1 Parent(s): e12b791

Upload Training Notebook (Simple NER v2).ipynb

Training Notebook (Simple NER v2).ipynb ADDED
@@ -0,0 +1,1374 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "c88f989c",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "os.environ['CUDA_VISIBLE_DEVICES']='7'"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "bfdbe247",
18
+ "metadata": {
19
+ "scrolled": true
20
+ },
21
+ "outputs": [
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "2023-02-26 02:35:07.275938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
27
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
28
+ "2023-02-26 02:35:07.472394: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
29
+ "2023-02-26 02:35:07.472434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n",
30
+ "2023-02-26 02:35:07.503598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
31
+ "2023-02-26 02:35:08.603575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
32
+ "2023-02-26 02:35:08.603678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
33
+ "2023-02-26 02:35:08.603689: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
34
+ "2023-02-26 02:35:15.326595: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n",
35
+ "2023-02-26 02:35:15.326728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory\n",
36
+ "2023-02-26 02:35:15.326831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory\n",
37
+ "2023-02-26 02:35:15.327013: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory\n",
38
+ "2023-02-26 02:35:15.327108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory\n",
39
+ "2023-02-26 02:35:15.327205: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n",
40
+ "2023-02-26 02:35:15.327224: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n",
41
+ "Skipping registering GPU devices...\n"
42
+ ]
43
+ }
44
+ ],
45
+ "source": [
46
+ "from transformers import AutoTokenizer\n",
47
+ "import re\n",
48
+ "import numpy as np\n",
49
+ "from random import Random\n",
50
+ "import torch\n",
51
+ "import pandas as pd\n",
52
+ "import spacy\n",
53
+ "import random\n",
54
+ "from datasets import load_dataset\n",
55
+ "from transformers import (\n",
56
+ " AutoModelForTokenClassification,\n",
57
+ " AutoTokenizer,\n",
58
+ " DataCollatorForTokenClassification,\n",
59
+ " TrainingArguments,\n",
60
+ " Trainer,\n",
61
+ " set_seed)\n",
62
+ "import numpy as np\n",
63
+ "import datasets\n",
64
+ "from collections import defaultdict\n",
65
+ "from datasets import load_metric"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 3,
71
+ "id": "7a916e9f",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# !pip install seqeval"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 4,
81
+ "id": "4b0590b7",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "per_device_train_batch_size = 16\n",
86
+ "per_device_eval_batch_size = 32\n",
87
+ "num_train_epochs = 5\n",
88
+ "weight_decay = 0.1\n",
89
+ "warmup_ratio = 0.1\n",
90
+ "learning_rate = 5e-5\n",
91
+ "load_best_model_at_end = True\n",
92
+ "output_dir = \"../akoksal/earthquake_ner_models/\"\n",
93
+ "old_data_path = \"annotated_address_dataset_07022023_766train_192test/\"\n",
94
+ "data_path = \"deprem-private/ner_v12\"\n",
95
+ "cache_dir = \"../akoksal/hf_cache\"\n",
96
+ "saved_models_path = \"../akoksal/earthquake_ner_models/\"\n",
97
+ "device = \"cuda\"\n",
98
+ "seed = 42\n",
99
+ "model_names = [\"dbmdz/bert-base-turkish-cased\",\n",
100
+ " \"dbmdz/electra-base-turkish-mc4-cased-discriminator\",\n",
101
+ " \"dbmdz/bert-base-turkish-128k-cased\",\n",
102
+ " \"dbmdz/convbert-base-turkish-cased\",\n",
103
+ " \"bert-base-multilingual-cased\",\n",
104
+ " \"xlm-roberta-base\"]\n",
105
+ "model_name = model_names[2]"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 5,
111
+ "id": "9aeb3dbe",
112
+ "metadata": {},
113
+ "outputs": [
114
+ {
115
+ "data": {
116
+ "text/plain": [
117
+ "'dbmdz/bert-base-turkish-128k-cased'"
118
+ ]
119
+ },
120
+ "execution_count": 5,
121
+ "metadata": {},
122
+ "output_type": "execute_result"
123
+ }
124
+ ],
125
+ "source": [
126
+ "model_name"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 6,
132
+ "id": "ffeb73e4",
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "set_seed(seed)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 7,
142
+ "id": "a876c516",
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "id2label = {\n",
147
+ " 0: \"O\",\n",
148
+ " 1: \"B-bina\",\n",
149
+ " 2: \"I-bina\",\n",
150
+ " 3: \"B-bulvar\",\n",
151
+ " 4: \"I-bulvar\",\n",
152
+ " 5: \"B-cadde\",\n",
153
+ " 6: \"I-cadde\",\n",
154
+ " 7: \"B-diskapino\",\n",
155
+ " 8: \"I-diskapino\",\n",
156
+ " 9: \"B-ilce\",\n",
157
+ " 10: \"I-ilce\",\n",
158
+ " 11: \"B-isim\",\n",
159
+ " 12: \"I-isim\",\n",
160
+ " 13: \"B-mahalle\",\n",
161
+ " 14: \"I-mahalle\",\n",
162
+ " 15: \"B-sehir\",\n",
163
+ " 16: \"I-sehir\",\n",
164
+ " 17: \"B-site\",\n",
165
+ " 18: \"I-site\",\n",
166
+ " 19: \"B-sokak\",\n",
167
+ " 20: \"I-sokak\",\n",
168
+ " 21: \"B-soyisim\",\n",
169
+ " 22: \"I-soyisim\",\n",
170
+ " 23: \"B-telefonno\",\n",
171
+ " 24: \"I-telefonno\",\n",
172
+ "}\n",
173
+ "\n",
174
+ "label2id = {label: idx for idx, label in id2label.items()}\n",
175
+ "label_names = list(label2id.keys())"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 8,
181
+ "id": "2e0caffc",
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "# from huggingface_hub import login\n",
186
+ "# login()"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 9,
192
+ "id": "c74850f9",
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "name": "stderr",
197
+ "output_type": "stream",
198
+ "text": [
199
+ "Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
200
+ "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
201
+ "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
202
+ "Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
203
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
204
+ ]
205
+ }
206
+ ],
207
+ "source": [
208
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
209
+ "model = AutoModelForTokenClassification.from_pretrained(model_name,\n",
210
+ " num_labels=len(label_names),\n",
211
+ " id2label=id2label,\n",
212
+ " cache_dir=cache_dir).to(device)"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 10,
218
+ "id": "4c1fe653",
219
+ "metadata": {},
220
+ "outputs": [
221
+ {
222
+ "name": "stderr",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "Using custom data configuration deprem-private--ner_v12-e2f61c5a18a7a738\n",
226
+ "Found cached dataset text (/mounts/Users/cisintern/akoksal/.cache/huggingface/datasets/deprem-private___text/deprem-private--ner_v12-e2f61c5a18a7a738/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n"
227
+ ]
228
+ },
229
+ {
230
+ "data": {
231
+ "application/vnd.jupyter.widget-view+json": {
232
+ "model_id": "22bc5f5f97204b41b2bc5dc3b71036e1",
233
+ "version_major": 2,
234
+ "version_minor": 0
235
+ },
236
+ "text/plain": [
237
+ " 0%| | 0/3 [00:00<?, ?it/s]"
238
+ ]
239
+ },
240
+ "metadata": {},
241
+ "output_type": "display_data"
242
+ }
243
+ ],
244
+ "source": [
245
+ "raw_dataset = datasets.load_dataset(\"deprem-private/ner_v12\", use_auth_token=True)\n",
246
+ "\n",
247
+ "new_dataset_json = {}\n",
248
+ "for split in [\"train\", \"validation\", \"test\"]:\n",
249
+ " ids = []\n",
250
+ " sentences = []\n",
251
+ " labels = []\n",
252
+ " ids = []\n",
253
+ " cur_idx = 0\n",
254
+ " unique_labels = set()\n",
255
+ " temp_sent = []\n",
256
+ " temp_labels = []\n",
257
+ " for word in raw_dataset[split][\"text\"]:\n",
258
+ " \n",
259
+ " if word!=\"\":\n",
260
+ " temp_sent.append((word.split()[0]))\n",
261
+ " temp_labels.append(label2id[(word.split()[1])])\n",
262
+ " else:\n",
263
+ " sentences.append(temp_sent)\n",
264
+ " labels.append(temp_labels)\n",
265
+ " ids.append(cur_idx)\n",
266
+ " cur_idx+=1\n",
267
+ " temp_sent = []\n",
268
+ " temp_labels = []\n",
269
+ " new_dataset_json[split] = {\"tokens\":sentences, \"ner_tags\":labels, \"ids\":ids}\n",
270
+ "\n",
271
+ "dataset = datasets.DatasetDict()\n",
272
+ "# using your `Dict` object\n",
273
+ "for k,v in new_dataset_json.items():\n",
274
+ " dataset[k] = datasets.Dataset.from_dict(v)"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 11,
280
+ "id": "65a66af9",
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "data": {
285
+ "application/vnd.jupyter.widget-view+json": {
286
+ "model_id": "a403f5fadb3041f4b18acc7ec41a2d36",
287
+ "version_major": 2,
288
+ "version_minor": 0
289
+ },
290
+ "text/plain": [
291
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
292
+ ]
293
+ },
294
+ "metadata": {},
295
+ "output_type": "display_data"
296
+ },
297
+ {
298
+ "data": {
299
+ "application/vnd.jupyter.widget-view+json": {
300
+ "model_id": "e2410f6106514cfd8207d8b42748c66d",
301
+ "version_major": 2,
302
+ "version_minor": 0
303
+ },
304
+ "text/plain": [
305
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
306
+ ]
307
+ },
308
+ "metadata": {},
309
+ "output_type": "display_data"
310
+ },
311
+ {
312
+ "data": {
313
+ "application/vnd.jupyter.widget-view+json": {
314
+ "model_id": "227e163e07b2414da9abdbe11cb0c6bf",
315
+ "version_major": 2,
316
+ "version_minor": 0
317
+ },
318
+ "text/plain": [
319
+ " 0%| | 0/1 [00:00<?, ?ba/s]"
320
+ ]
321
+ },
322
+ "metadata": {},
323
+ "output_type": "display_data"
324
+ }
325
+ ],
326
+ "source": [
327
+ "# dataset = datasets.load_from_disk(old_data_path)\n",
328
+ "def tokenize_and_align_labels(examples):\n",
329
+ " tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
330
+ "\n",
331
+ " labels = []\n",
332
+ " for i, label in enumerate(examples[f\"ner_tags\"]):\n",
333
+ " word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.\n",
334
+ " previous_word_idx = None\n",
335
+ " label_ids = []\n",
336
+ " for word_idx in word_ids: # Set the special tokens to -100.\n",
337
+ " if word_idx is None:\n",
338
+ " label_ids.append(-100)\n",
339
+ " elif word_idx != previous_word_idx: # Only label the first token of a given word.\n",
340
+ " label_ids.append(label[word_idx])\n",
341
+ " else:\n",
342
+ " label_ids.append(-100)\n",
343
+ " previous_word_idx = word_idx\n",
344
+ " labels.append(label_ids)\n",
345
+ "\n",
346
+ " tokenized_inputs[\"labels\"] = labels\n",
347
+ " return tokenized_inputs\n",
348
+ "\n",
349
+ "tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 12,
355
+ "id": "6b43934d",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "data_collator = DataCollatorForTokenClassification(tokenizer)"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 13,
365
+ "id": "c24f52db",
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stderr",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "/tmp/ipykernel_2652487/885599324.py:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
373
+ " metric = load_metric(\"seqeval\")\n"
374
+ ]
375
+ }
376
+ ],
377
+ "source": [
378
+ "metric = load_metric(\"seqeval\")\n",
379
+ "def compute_metrics(p):\n",
380
+ " predictions, labels = p\n",
381
+ " predictions = np.argmax(predictions, axis=2)\n",
382
+ "\n",
383
+ " # Remove ignored index (special tokens)\n",
384
+ " true_predictions = [\n",
385
+ " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n",
386
+ " for prediction, label in zip(predictions, labels)\n",
387
+ " ]\n",
388
+ " true_labels = [\n",
389
+ " [label_names[l] for (p, l) in zip(prediction, label) if l != -100]\n",
390
+ " for prediction, label in zip(predictions, labels)\n",
391
+ " ]\n",
392
+ "\n",
393
+ " results = metric.compute(predictions=true_predictions, references=true_labels)\n",
394
+ " flattened_results = {\n",
395
+ " \"overall_precision\": results[\"overall_precision\"],\n",
396
+ " \"overall_recall\": results[\"overall_recall\"],\n",
397
+ " \"overall_f1\": results[\"overall_f1\"],\n",
398
+ " \"overall_accuracy\": results[\"overall_accuracy\"],\n",
399
+ " }\n",
400
+ " for k in results.keys():\n",
401
+ " if(k not in flattened_results.keys()):\n",
402
+ " flattened_results[k+\"_f1\"]=results[k][\"f1\"]\n",
403
+ " flattened_results[k+\"_recall\"]=results[k][\"recall\"]\n",
404
+ " flattened_results[k+\"_precision\"]=results[k][\"precision\"]\n",
405
+ " flattened_results[k+\"_support\"]=results[k][\"number\"]\n",
406
+ "\n",
407
+ " return flattened_results"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 14,
413
+ "id": "a955fd51",
414
+ "metadata": {},
415
+ "outputs": [],
416
+ "source": [
417
+ "training_args = TrainingArguments(\n",
418
+ " output_dir=saved_models_path,\n",
419
+ " evaluation_strategy=\"epoch\",\n",
420
+ " learning_rate=learning_rate,\n",
421
+ " per_device_train_batch_size=per_device_train_batch_size,\n",
422
+ " per_device_eval_batch_size=per_device_eval_batch_size,\n",
423
+ " num_train_epochs=num_train_epochs,\n",
424
+ " warmup_ratio=warmup_ratio,\n",
425
+ " weight_decay=weight_decay,\n",
426
+ " run_name = \"turkish_ner\",\n",
427
+ " save_strategy='epoch',\n",
428
+ " logging_strategy=\"epoch\",\n",
429
+ " save_total_limit=3,\n",
430
+ " load_best_model_at_end=load_best_model_at_end,\n",
431
+ " \n",
432
+ ")\n",
433
+ "trainer = Trainer(\n",
434
+ " model=model,\n",
435
+ " args=training_args,\n",
436
+ " train_dataset=tokenized_dataset[\"train\"],\n",
437
+ " eval_dataset=tokenized_dataset[\"validation\"],\n",
438
+ " data_collator=data_collator,\n",
439
+ " tokenizer=tokenizer,\n",
440
+ " compute_metrics=compute_metrics\n",
441
+ ")"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 15,
447
+ "id": "9f78efdc",
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "name": "stderr",
452
+ "output_type": "stream",
453
+ "text": [
454
+ "The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
455
+ "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
456
+ " warnings.warn(\n",
457
+ "***** Running training *****\n",
458
+ " Num examples = 799\n",
459
+ " Num Epochs = 5\n",
460
+ " Instantaneous batch size per device = 16\n",
461
+ " Total train batch size (w. parallel, distributed & accumulation) = 16\n",
462
+ " Gradient Accumulation steps = 1\n",
463
+ " Total optimization steps = 250\n",
464
+ " Number of trainable parameters = 183773977\n",
465
+ "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
466
+ ]
467
+ },
468
+ {
469
+ "data": {
470
+ "text/html": [
471
+ "\n",
472
+ " <div>\n",
473
+ " \n",
474
+ " <progress value='250' max='250' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
475
+ " [250/250 01:12, Epoch 5/5]\n",
476
+ " </div>\n",
477
+ " <table border=\"1\" class=\"dataframe\">\n",
478
+ " <thead>\n",
479
+ " <tr style=\"text-align: left;\">\n",
480
+ " <th>Epoch</th>\n",
481
+ " <th>Training Loss</th>\n",
482
+ " <th>Validation Loss</th>\n",
483
+ " <th>Overall Precision</th>\n",
484
+ " <th>Overall Recall</th>\n",
485
+ " <th>Overall F1</th>\n",
486
+ " <th>Overall Accuracy</th>\n",
487
+ " <th>Bina F1</th>\n",
488
+ " <th>Bina Recall</th>\n",
489
+ " <th>Bina Precision</th>\n",
490
+ " <th>Bina Support</th>\n",
491
+ " <th>Bulvar F1</th>\n",
492
+ " <th>Bulvar Recall</th>\n",
493
+ " <th>Bulvar Precision</th>\n",
494
+ " <th>Bulvar Support</th>\n",
495
+ " <th>Cadde F1</th>\n",
496
+ " <th>Cadde Recall</th>\n",
497
+ " <th>Cadde Precision</th>\n",
498
+ " <th>Cadde Support</th>\n",
499
+ " <th>Diskapino F1</th>\n",
500
+ " <th>Diskapino Recall</th>\n",
501
+ " <th>Diskapino Precision</th>\n",
502
+ " <th>Diskapino Support</th>\n",
503
+ " <th>Ilce F1</th>\n",
504
+ " <th>Ilce Recall</th>\n",
505
+ " <th>Ilce Precision</th>\n",
506
+ " <th>Ilce Support</th>\n",
507
+ " <th>Isim F1</th>\n",
508
+ " <th>Isim Recall</th>\n",
509
+ " <th>Isim Precision</th>\n",
510
+ " <th>Isim Support</th>\n",
511
+ " <th>Mahalle F1</th>\n",
512
+ " <th>Mahalle Recall</th>\n",
513
+ " <th>Mahalle Precision</th>\n",
514
+ " <th>Mahalle Support</th>\n",
515
+ " <th>Sehir F1</th>\n",
516
+ " <th>Sehir Recall</th>\n",
517
+ " <th>Sehir Precision</th>\n",
518
+ " <th>Sehir Support</th>\n",
519
+ " <th>Site F1</th>\n",
520
+ " <th>Site Recall</th>\n",
521
+ " <th>Site Precision</th>\n",
522
+ " <th>Site Support</th>\n",
523
+ " <th>Sokak F1</th>\n",
524
+ " <th>Sokak Recall</th>\n",
525
+ " <th>Sokak Precision</th>\n",
526
+ " <th>Sokak Support</th>\n",
527
+ " <th>Soyisim F1</th>\n",
528
+ " <th>Soyisim Recall</th>\n",
529
+ " <th>Soyisim Precision</th>\n",
530
+ " <th>Soyisim Support</th>\n",
531
+ " <th>Telefonno F1</th>\n",
532
+ " <th>Telefonno Recall</th>\n",
533
+ " <th>Telefonno Precision</th>\n",
534
+ " <th>Telefonno Support</th>\n",
535
+ " </tr>\n",
536
+ " </thead>\n",
537
+ " <tbody>\n",
538
+ " <tr>\n",
539
+ " <td>1</td>\n",
540
+ " <td>1.349500</td>\n",
541
+ " <td>0.357321</td>\n",
542
+ " <td>0.783270</td>\n",
543
+ " <td>0.828974</td>\n",
544
+ " <td>0.805474</td>\n",
545
+ " <td>0.908936</td>\n",
546
+ " <td>0.600000</td>\n",
547
+ " <td>0.705882</td>\n",
548
+ " <td>0.521739</td>\n",
549
+ " <td>34</td>\n",
550
+ " <td>0.000000</td>\n",
551
+ " <td>0.000000</td>\n",
552
+ " <td>0.000000</td>\n",
553
+ " <td>5</td>\n",
554
+ " <td>0.588235</td>\n",
555
+ " <td>0.833333</td>\n",
556
+ " <td>0.454545</td>\n",
557
+ " <td>24</td>\n",
558
+ " <td>0.769231</td>\n",
559
+ " <td>0.892857</td>\n",
560
+ " <td>0.675676</td>\n",
561
+ " <td>28</td>\n",
562
+ " <td>0.830508</td>\n",
563
+ " <td>0.816667</td>\n",
564
+ " <td>0.844828</td>\n",
565
+ " <td>60</td>\n",
566
+ " <td>0.888889</td>\n",
567
+ " <td>0.926829</td>\n",
568
+ " <td>0.853933</td>\n",
569
+ " <td>82</td>\n",
570
+ " <td>0.750000</td>\n",
571
+ " <td>0.792453</td>\n",
572
+ " <td>0.711864</td>\n",
573
+ " <td>53</td>\n",
574
+ " <td>0.867133</td>\n",
575
+ " <td>0.861111</td>\n",
576
+ " <td>0.873239</td>\n",
577
+ " <td>72</td>\n",
578
+ " <td>0.000000</td>\n",
579
+ " <td>0.000000</td>\n",
580
+ " <td>0.000000</td>\n",
581
+ " <td>6</td>\n",
582
+ " <td>0.750000</td>\n",
583
+ " <td>0.620690</td>\n",
584
+ " <td>0.947368</td>\n",
585
+ " <td>29</td>\n",
586
+ " <td>0.900000</td>\n",
587
+ " <td>0.887324</td>\n",
588
+ " <td>0.913043</td>\n",
589
+ " <td>71</td>\n",
590
+ " <td>0.985075</td>\n",
591
+ " <td>1.000000</td>\n",
592
+ " <td>0.970588</td>\n",
593
+ " <td>33</td>\n",
594
+ " </tr>\n",
595
+ " <tr>\n",
596
+ " <td>2</td>\n",
597
+ " <td>0.264700</td>\n",
598
+ " <td>0.220467</td>\n",
599
+ " <td>0.885149</td>\n",
600
+ " <td>0.899396</td>\n",
601
+ " <td>0.892216</td>\n",
602
+ " <td>0.944792</td>\n",
603
+ " <td>0.782609</td>\n",
604
+ " <td>0.794118</td>\n",
605
+ " <td>0.771429</td>\n",
606
+ " <td>34</td>\n",
607
+ " <td>0.666667</td>\n",
608
+ " <td>0.800000</td>\n",
609
+ " <td>0.571429</td>\n",
610
+ " <td>5</td>\n",
611
+ " <td>0.875000</td>\n",
612
+ " <td>0.875000</td>\n",
613
+ " <td>0.875000</td>\n",
614
+ " <td>24</td>\n",
615
+ " <td>0.862069</td>\n",
616
+ " <td>0.892857</td>\n",
617
+ " <td>0.833333</td>\n",
618
+ " <td>28</td>\n",
619
+ " <td>0.894309</td>\n",
620
+ " <td>0.916667</td>\n",
621
+ " <td>0.873016</td>\n",
622
+ " <td>60</td>\n",
623
+ " <td>0.884848</td>\n",
624
+ " <td>0.890244</td>\n",
625
+ " <td>0.879518</td>\n",
626
+ " <td>82</td>\n",
627
+ " <td>0.897196</td>\n",
628
+ " <td>0.905660</td>\n",
629
+ " <td>0.888889</td>\n",
630
+ " <td>53</td>\n",
631
+ " <td>0.915493</td>\n",
632
+ " <td>0.902778</td>\n",
633
+ " <td>0.928571</td>\n",
634
+ " <td>72</td>\n",
635
+ " <td>0.181818</td>\n",
636
+ " <td>0.166667</td>\n",
637
+ " <td>0.200000</td>\n",
638
+ " <td>6</td>\n",
639
+ " <td>0.949153</td>\n",
640
+ " <td>0.965517</td>\n",
641
+ " <td>0.933333</td>\n",
642
+ " <td>29</td>\n",
643
+ " <td>0.950355</td>\n",
644
+ " <td>0.943662</td>\n",
645
+ " <td>0.957143</td>\n",
646
+ " <td>71</td>\n",
647
+ " <td>0.985075</td>\n",
648
+ " <td>1.000000</td>\n",
649
+ " <td>0.970588</td>\n",
650
+ " <td>33</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <td>3</td>\n",
654
+ " <td>0.158700</td>\n",
655
+ " <td>0.219565</td>\n",
656
+ " <td>0.876768</td>\n",
657
+ " <td>0.873239</td>\n",
658
+ " <td>0.875000</td>\n",
659
+ " <td>0.940808</td>\n",
660
+ " <td>0.805556</td>\n",
661
+ " <td>0.852941</td>\n",
662
+ " <td>0.763158</td>\n",
663
+ " <td>34</td>\n",
664
+ " <td>0.666667</td>\n",
665
+ " <td>1.000000</td>\n",
666
+ " <td>0.500000</td>\n",
667
+ " <td>5</td>\n",
668
+ " <td>0.880000</td>\n",
669
+ " <td>0.916667</td>\n",
670
+ " <td>0.846154</td>\n",
671
+ " <td>24</td>\n",
672
+ " <td>0.827586</td>\n",
673
+ " <td>0.857143</td>\n",
674
+ " <td>0.800000</td>\n",
675
+ " <td>28</td>\n",
676
+ " <td>0.881356</td>\n",
677
+ " <td>0.866667</td>\n",
678
+ " <td>0.896552</td>\n",
679
+ " <td>60</td>\n",
680
+ " <td>0.822785</td>\n",
681
+ " <td>0.792683</td>\n",
682
+ " <td>0.855263</td>\n",
683
+ " <td>82</td>\n",
684
+ " <td>0.886792</td>\n",
685
+ " <td>0.886792</td>\n",
686
+ " <td>0.886792</td>\n",
687
+ " <td>53</td>\n",
688
+ " <td>0.892086</td>\n",
689
+ " <td>0.861111</td>\n",
690
+ " <td>0.925373</td>\n",
691
+ " <td>72</td>\n",
692
+ " <td>0.400000</td>\n",
693
+ " <td>0.333333</td>\n",
694
+ " <td>0.500000</td>\n",
695
+ " <td>6</td>\n",
696
+ " <td>0.881356</td>\n",
697
+ " <td>0.896552</td>\n",
698
+ " <td>0.866667</td>\n",
699
+ " <td>29</td>\n",
700
+ " <td>0.957143</td>\n",
701
+ " <td>0.943662</td>\n",
702
+ " <td>0.971014</td>\n",
703
+ " <td>71</td>\n",
704
+ " <td>0.985075</td>\n",
705
+ " <td>1.000000</td>\n",
706
+ " <td>0.970588</td>\n",
707
+ " <td>33</td>\n",
708
+ " </tr>\n",
709
+ " <tr>\n",
710
+ " <td>4</td>\n",
711
+ " <td>0.115000</td>\n",
712
+ " <td>0.215329</td>\n",
713
+ " <td>0.897541</td>\n",
714
+ " <td>0.881288</td>\n",
715
+ " <td>0.889340</td>\n",
716
+ " <td>0.946500</td>\n",
717
+ " <td>0.857143</td>\n",
718
+ " <td>0.882353</td>\n",
719
+ " <td>0.833333</td>\n",
720
+ " <td>34</td>\n",
721
+ " <td>0.909091</td>\n",
722
+ " <td>1.000000</td>\n",
723
+ " <td>0.833333</td>\n",
724
+ " <td>5</td>\n",
725
+ " <td>0.897959</td>\n",
726
+ " <td>0.916667</td>\n",
727
+ " <td>0.880000</td>\n",
728
+ " <td>24</td>\n",
729
+ " <td>0.862069</td>\n",
730
+ " <td>0.892857</td>\n",
731
+ " <td>0.833333</td>\n",
732
+ " <td>28</td>\n",
733
+ " <td>0.881356</td>\n",
734
+ " <td>0.866667</td>\n",
735
+ " <td>0.896552</td>\n",
736
+ " <td>60</td>\n",
737
+ " <td>0.810127</td>\n",
738
+ " <td>0.780488</td>\n",
739
+ " <td>0.842105</td>\n",
740
+ " <td>82</td>\n",
741
+ " <td>0.886792</td>\n",
742
+ " <td>0.886792</td>\n",
743
+ " <td>0.886792</td>\n",
744
+ " <td>53</td>\n",
745
+ " <td>0.890511</td>\n",
746
+ " <td>0.847222</td>\n",
747
+ " <td>0.938462</td>\n",
748
+ " <td>72</td>\n",
749
+ " <td>0.727273</td>\n",
750
+ " <td>0.666667</td>\n",
751
+ " <td>0.800000</td>\n",
752
+ " <td>6</td>\n",
753
+ " <td>0.950820</td>\n",
754
+ " <td>1.000000</td>\n",
755
+ " <td>0.906250</td>\n",
756
+ " <td>29</td>\n",
757
+ " <td>0.949640</td>\n",
758
+ " <td>0.929577</td>\n",
759
+ " <td>0.970588</td>\n",
760
+ " <td>71</td>\n",
761
+ " <td>0.985075</td>\n",
762
+ " <td>1.000000</td>\n",
763
+ " <td>0.970588</td>\n",
764
+ " <td>33</td>\n",
765
+ " </tr>\n",
766
+ " <tr>\n",
767
+ " <td>5</td>\n",
768
+ " <td>0.093800</td>\n",
769
+ " <td>0.231558</td>\n",
770
+ " <td>0.895492</td>\n",
771
+ " <td>0.879276</td>\n",
772
+ " <td>0.887310</td>\n",
773
+ " <td>0.945361</td>\n",
774
+ " <td>0.833333</td>\n",
775
+ " <td>0.882353</td>\n",
776
+ " <td>0.789474</td>\n",
777
+ " <td>34</td>\n",
778
+ " <td>0.909091</td>\n",
779
+ " <td>1.000000</td>\n",
780
+ " <td>0.833333</td>\n",
781
+ " <td>5</td>\n",
782
+ " <td>0.880000</td>\n",
783
+ " <td>0.916667</td>\n",
784
+ " <td>0.846154</td>\n",
785
+ " <td>24</td>\n",
786
+ " <td>0.813559</td>\n",
787
+ " <td>0.857143</td>\n",
788
+ " <td>0.774194</td>\n",
789
+ " <td>28</td>\n",
790
+ " <td>0.888889</td>\n",
791
+ " <td>0.866667</td>\n",
792
+ " <td>0.912281</td>\n",
793
+ " <td>60</td>\n",
794
+ " <td>0.833333</td>\n",
795
+ " <td>0.792683</td>\n",
796
+ " <td>0.878378</td>\n",
797
+ " <td>82</td>\n",
798
+ " <td>0.895238</td>\n",
799
+ " <td>0.886792</td>\n",
800
+ " <td>0.903846</td>\n",
801
+ " <td>53</td>\n",
802
+ " <td>0.898551</td>\n",
803
+ " <td>0.861111</td>\n",
804
+ " <td>0.939394</td>\n",
805
+ " <td>72</td>\n",
806
+ " <td>0.727273</td>\n",
807
+ " <td>0.666667</td>\n",
808
+ " <td>0.800000</td>\n",
809
+ " <td>6</td>\n",
810
+ " <td>0.881356</td>\n",
811
+ " <td>0.896552</td>\n",
812
+ " <td>0.866667</td>\n",
813
+ " <td>29</td>\n",
814
+ " <td>0.957143</td>\n",
815
+ " <td>0.943662</td>\n",
816
+ " <td>0.971014</td>\n",
817
+ " <td>71</td>\n",
818
+ " <td>0.985075</td>\n",
819
+ " <td>1.000000</td>\n",
820
+ " <td>0.970588</td>\n",
821
+ " <td>33</td>\n",
822
+ " </tr>\n",
823
+ " </tbody>\n",
824
+ "</table><p>"
825
+ ],
826
+ "text/plain": [
827
+ "<IPython.core.display.HTML object>"
828
+ ]
829
+ },
830
+ "metadata": {},
831
+ "output_type": "display_data"
832
+ },
833
+ {
834
+ "name": "stderr",
835
+ "output_type": "stream",
836
+ "text": [
837
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
838
+ "***** Running Evaluation *****\n",
839
+ " Num examples = 58\n",
840
+ " Batch size = 32\n",
841
+ "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
842
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
843
+ "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-50\n",
844
+ "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/config.json\n",
845
+ "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/pytorch_model.bin\n",
846
+ "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/tokenizer_config.json\n",
847
+ "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/special_tokens_map.json\n",
848
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
849
+ "***** Running Evaluation *****\n",
850
+ " Num examples = 58\n",
851
+ " Batch size = 32\n",
852
+ "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-100\n",
853
+ "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/config.json\n",
854
+ "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/pytorch_model.bin\n",
855
+ "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/tokenizer_config.json\n",
856
+ "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/special_tokens_map.json\n",
857
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
858
+ "***** Running Evaluation *****\n",
859
+ " Num examples = 58\n",
860
+ " Batch size = 32\n",
861
+ "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-150\n",
862
+ "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/config.json\n",
863
+ "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/pytorch_model.bin\n",
864
+ "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/tokenizer_config.json\n",
865
+ "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/special_tokens_map.json\n",
866
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
867
+ "***** Running Evaluation *****\n",
868
+ " Num examples = 58\n",
869
+ " Batch size = 32\n",
870
+ "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-200\n",
871
+ "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/config.json\n",
872
+ "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/pytorch_model.bin\n",
873
+ "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/tokenizer_config.json\n",
874
+ "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/special_tokens_map.json\n",
875
+ "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-50] due to args.save_total_limit\n",
876
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
877
+ "***** Running Evaluation *****\n",
878
+ " Num examples = 58\n",
879
+ " Batch size = 32\n",
880
+ "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-250\n",
881
+ "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/config.json\n",
882
+ "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/pytorch_model.bin\n",
883
+ "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/tokenizer_config.json\n",
884
+ "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/special_tokens_map.json\n",
885
+ "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-100] due to args.save_total_limit\n",
886
+ "\n",
887
+ "\n",
888
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
889
+ "\n",
890
+ "\n",
891
+ "Loading best model from /mounts/work/akoksal/earthquake_ner_models/checkpoint-200 (score: 0.21532948315143585).\n"
892
+ ]
893
+ },
894
+ {
895
+ "data": {
896
+ "text/plain": [
897
+ "TrainOutput(global_step=250, training_loss=0.3963502960205078, metrics={'train_runtime': 73.0701, 'train_samples_per_second': 54.674, 'train_steps_per_second': 3.421, 'total_flos': 129863927953500.0, 'train_loss': 0.3963502960205078, 'epoch': 5.0})"
898
+ ]
899
+ },
900
+ "execution_count": 15,
901
+ "metadata": {},
902
+ "output_type": "execute_result"
903
+ }
904
+ ],
905
+ "source": [
906
+ "trainer.train()"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": 16,
912
+ "id": "4427c32d",
913
+ "metadata": {},
914
+ "outputs": [
915
+ {
916
+ "name": "stderr",
917
+ "output_type": "stream",
918
+ "text": [
919
+ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n",
920
+ "***** Running Evaluation *****\n",
921
+ " Num examples = 129\n",
922
+ " Batch size = 32\n"
923
+ ]
924
+ },
925
+ {
926
+ "data": {
927
+ "text/html": [
928
+ "\n",
929
+ " <div>\n",
930
+ " \n",
931
+ " <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
932
+ " [5/5 00:00]\n",
933
+ " </div>\n",
934
+ " "
935
+ ],
936
+ "text/plain": [
937
+ "<IPython.core.display.HTML object>"
938
+ ]
939
+ },
940
+ "metadata": {},
941
+ "output_type": "display_data"
942
+ }
943
+ ],
944
+ "source": [
945
+ "results = trainer.evaluate(tokenized_dataset[\"test\"])"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "code",
950
+ "execution_count": 24,
951
+ "id": "aabbb977",
952
+ "metadata": {},
953
+ "outputs": [
954
+ {
955
+ "data": {
956
+ "text/plain": [
957
+ "{'eval_loss': 0.24822480976581573,\n",
958
+ " 'eval_overall_precision': 0.8442211055276382,\n",
959
+ " 'eval_overall_recall': 0.877742946708464,\n",
960
+ " 'eval_overall_f1': 0.860655737704918,\n",
961
+ " 'eval_overall_accuracy': 0.9401853411962932,\n",
962
+ " 'eval_bina_f1': 0.7000000000000001,\n",
963
+ " 'eval_bina_recall': 0.7424242424242424,\n",
964
+ " 'eval_bina_precision': 0.6621621621621622,\n",
965
+ " 'eval_bina_support': 66,\n",
966
+ " 'eval_bulvar_f1': 0.9230769230769231,\n",
967
+ " 'eval_bulvar_recall': 0.9230769230769231,\n",
968
+ " 'eval_bulvar_precision': 0.9230769230769231,\n",
969
+ " 'eval_bulvar_support': 13,\n",
970
+ " 'eval_cadde_f1': 0.8067226890756302,\n",
971
+ " 'eval_cadde_recall': 0.8421052631578947,\n",
972
+ " 'eval_cadde_precision': 0.7741935483870968,\n",
973
+ " 'eval_cadde_support': 57,\n",
974
+ " 'eval_diskapino_f1': 0.7083333333333334,\n",
975
+ " 'eval_diskapino_recall': 0.7285714285714285,\n",
976
+ " 'eval_diskapino_precision': 0.6891891891891891,\n",
977
+ " 'eval_diskapino_support': 70,\n",
978
+ " 'eval_ilce_f1': 0.9218106995884773,\n",
979
+ " 'eval_ilce_recall': 0.9572649572649573,\n",
980
+ " 'eval_ilce_precision': 0.8888888888888888,\n",
981
+ " 'eval_ilce_support': 117,\n",
982
+ " 'eval_isim_f1': 0.8793103448275862,\n",
983
+ " 'eval_isim_recall': 0.9026548672566371,\n",
984
+ " 'eval_isim_precision': 0.8571428571428571,\n",
985
+ " 'eval_isim_support': 113,\n",
986
+ " 'eval_mahalle_f1': 0.7903225806451613,\n",
987
+ " 'eval_mahalle_recall': 0.8166666666666667,\n",
988
+ " 'eval_mahalle_precision': 0.765625,\n",
989
+ " 'eval_mahalle_support': 120,\n",
990
+ " 'eval_sehir_f1': 0.9724137931034483,\n",
991
+ " 'eval_sehir_recall': 0.9657534246575342,\n",
992
+ " 'eval_sehir_precision': 0.9791666666666666,\n",
993
+ " 'eval_sehir_support': 146,\n",
994
+ " 'eval_site_f1': 0.6875000000000001,\n",
995
+ " 'eval_site_recall': 0.6111111111111112,\n",
996
+ " 'eval_site_precision': 0.7857142857142857,\n",
997
+ " 'eval_site_support': 18,\n",
998
+ " 'eval_sokak_f1': 0.7301587301587302,\n",
999
+ " 'eval_sokak_recall': 0.7419354838709677,\n",
1000
+ " 'eval_sokak_precision': 0.71875,\n",
1001
+ " 'eval_sokak_support': 62,\n",
1002
+ " 'eval_soyisim_f1': 0.9441624365482234,\n",
1003
+ " 'eval_soyisim_recall': 0.9489795918367347,\n",
1004
+ " 'eval_soyisim_precision': 0.9393939393939394,\n",
1005
+ " 'eval_soyisim_support': 98,\n",
1006
+ " 'eval_telefonno_f1': 0.9935483870967742,\n",
1007
+ " 'eval_telefonno_recall': 1.0,\n",
1008
+ " 'eval_telefonno_precision': 0.9871794871794872,\n",
1009
+ " 'eval_telefonno_support': 77,\n",
1010
+ " 'eval_runtime': 0.3493,\n",
1011
+ " 'eval_samples_per_second': 369.308,\n",
1012
+ " 'eval_steps_per_second': 14.314,\n",
1013
+ " 'epoch': 5.0}"
1014
+ ]
1015
+ },
1016
+ "execution_count": 24,
1017
+ "metadata": {},
1018
+ "output_type": "execute_result"
1019
+ }
1020
+ ],
1021
+ "source": [
1022
+ "results"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": 18,
1028
+ "id": "922a7237",
1029
+ "metadata": {},
1030
+ "outputs": [
1031
+ {
1032
+ "data": {
1033
+ "text/html": [
1034
+ "<div>\n",
1035
+ "<style scoped>\n",
1036
+ " .dataframe tbody tr th:only-of-type {\n",
1037
+ " vertical-align: middle;\n",
1038
+ " }\n",
1039
+ "\n",
1040
+ " .dataframe tbody tr th {\n",
1041
+ " vertical-align: top;\n",
1042
+ " }\n",
1043
+ "\n",
1044
+ " .dataframe thead th {\n",
1045
+ " text-align: right;\n",
1046
+ " }\n",
1047
+ "</style>\n",
1048
+ "<table border=\"1\" class=\"dataframe\">\n",
1049
+ " <thead>\n",
1050
+ " <tr style=\"text-align: right;\">\n",
1051
+ " <th></th>\n",
1052
+ " <th>support</th>\n",
1053
+ " <th>precision</th>\n",
1054
+ " <th>recall</th>\n",
1055
+ " <th>f1</th>\n",
1056
+ " <th>accuracy</th>\n",
1057
+ " </tr>\n",
1058
+ " </thead>\n",
1059
+ " <tbody>\n",
1060
+ " <tr>\n",
1061
+ " <th>overall</th>\n",
1062
+ " <td>957</td>\n",
1063
+ " <td>0.84</td>\n",
1064
+ " <td>0.88</td>\n",
1065
+ " <td>0.86</td>\n",
1066
+ " <td>0.94</td>\n",
1067
+ " </tr>\n",
1068
+ " <tr>\n",
1069
+ " <th>bina</th>\n",
1070
+ " <td>66</td>\n",
1071
+ " <td>0.66</td>\n",
1072
+ " <td>0.74</td>\n",
1073
+ " <td>0.70</td>\n",
1074
+ " <td>NaN</td>\n",
1075
+ " </tr>\n",
1076
+ " <tr>\n",
1077
+ " <th>bulvar</th>\n",
1078
+ " <td>13</td>\n",
1079
+ " <td>0.92</td>\n",
1080
+ " <td>0.92</td>\n",
1081
+ " <td>0.92</td>\n",
1082
+ " <td>NaN</td>\n",
1083
+ " </tr>\n",
1084
+ " <tr>\n",
1085
+ " <th>cadde</th>\n",
1086
+ " <td>57</td>\n",
1087
+ " <td>0.77</td>\n",
1088
+ " <td>0.84</td>\n",
1089
+ " <td>0.81</td>\n",
1090
+ " <td>NaN</td>\n",
1091
+ " </tr>\n",
1092
+ " <tr>\n",
1093
+ " <th>diskapino</th>\n",
1094
+ " <td>70</td>\n",
1095
+ " <td>0.69</td>\n",
1096
+ " <td>0.73</td>\n",
1097
+ " <td>0.71</td>\n",
1098
+ " <td>NaN</td>\n",
1099
+ " </tr>\n",
1100
+ " <tr>\n",
1101
+ " <th>ilce</th>\n",
1102
+ " <td>117</td>\n",
1103
+ " <td>0.89</td>\n",
1104
+ " <td>0.96</td>\n",
1105
+ " <td>0.92</td>\n",
1106
+ " <td>NaN</td>\n",
1107
+ " </tr>\n",
1108
+ " <tr>\n",
1109
+ " <th>isim</th>\n",
1110
+ " <td>113</td>\n",
1111
+ " <td>0.86</td>\n",
1112
+ " <td>0.90</td>\n",
1113
+ " <td>0.88</td>\n",
1114
+ " <td>NaN</td>\n",
1115
+ " </tr>\n",
1116
+ " <tr>\n",
1117
+ " <th>mahalle</th>\n",
1118
+ " <td>120</td>\n",
1119
+ " <td>0.77</td>\n",
1120
+ " <td>0.82</td>\n",
1121
+ " <td>0.79</td>\n",
1122
+ " <td>NaN</td>\n",
1123
+ " </tr>\n",
1124
+ " <tr>\n",
1125
+ " <th>sehir</th>\n",
1126
+ " <td>146</td>\n",
1127
+ " <td>0.98</td>\n",
1128
+ " <td>0.97</td>\n",
1129
+ " <td>0.97</td>\n",
1130
+ " <td>NaN</td>\n",
1131
+ " </tr>\n",
1132
+ " <tr>\n",
1133
+ " <th>site</th>\n",
1134
+ " <td>18</td>\n",
1135
+ " <td>0.79</td>\n",
1136
+ " <td>0.61</td>\n",
1137
+ " <td>0.69</td>\n",
1138
+ " <td>NaN</td>\n",
1139
+ " </tr>\n",
1140
+ " <tr>\n",
1141
+ " <th>sokak</th>\n",
1142
+ " <td>62</td>\n",
1143
+ " <td>0.72</td>\n",
1144
+ " <td>0.74</td>\n",
1145
+ " <td>0.73</td>\n",
1146
+ " <td>NaN</td>\n",
1147
+ " </tr>\n",
1148
+ " <tr>\n",
1149
+ " <th>soyisim</th>\n",
1150
+ " <td>98</td>\n",
1151
+ " <td>0.94</td>\n",
1152
+ " <td>0.95</td>\n",
1153
+ " <td>0.94</td>\n",
1154
+ " <td>NaN</td>\n",
1155
+ " </tr>\n",
1156
+ " <tr>\n",
1157
+ " <th>telefonno</th>\n",
1158
+ " <td>77</td>\n",
1159
+ " <td>0.99</td>\n",
1160
+ " <td>1.00</td>\n",
1161
+ " <td>0.99</td>\n",
1162
+ " <td>NaN</td>\n",
1163
+ " </tr>\n",
1164
+ " </tbody>\n",
1165
+ "</table>\n",
1166
+ "</div>"
1167
+ ],
1168
+ "text/plain": [
1169
+ " support precision recall f1 accuracy\n",
1170
+ "overall 957 0.84 0.88 0.86 0.94\n",
1171
+ "bina 66 0.66 0.74 0.70 NaN\n",
1172
+ "bulvar 13 0.92 0.92 0.92 NaN\n",
1173
+ "cadde 57 0.77 0.84 0.81 NaN\n",
1174
+ "diskapino 70 0.69 0.73 0.71 NaN\n",
1175
+ "ilce 117 0.89 0.96 0.92 NaN\n",
1176
+ "isim 113 0.86 0.90 0.88 NaN\n",
1177
+ "mahalle 120 0.77 0.82 0.79 NaN\n",
1178
+ "sehir 146 0.98 0.97 0.97 NaN\n",
1179
+ "site 18 0.79 0.61 0.69 NaN\n",
1180
+ "sokak 62 0.72 0.74 0.73 NaN\n",
1181
+ "soyisim 98 0.94 0.95 0.94 NaN\n",
1182
+ "telefonno 77 0.99 1.00 0.99 NaN"
1183
+ ]
1184
+ },
1185
+ "execution_count": 18,
1186
+ "metadata": {},
1187
+ "output_type": "execute_result"
1188
+ }
1189
+ ],
1190
+ "source": [
1191
+ "structured_results = defaultdict(dict)\n",
1192
+ "structured_results[\"overall\"][\"support\"]=0\n",
1193
+ "for x, y in results.items():\n",
1194
+ " if len(x.split(\"_\"))==3:\n",
1195
+ " structured_results[x.split(\"_\")[1]][x.split(\"_\")[2]] = y\n",
1196
+ " if x.split(\"_\")[2]==\"support\":\n",
1197
+ " structured_results[\"overall\"][\"support\"]+=y\n",
1198
+ "results_pd = pd.DataFrame(structured_results).T\n",
1199
+ "results_pd.support = results_pd.support.astype(int)\n",
1200
+ "results_pd.round(2)"
1201
+ ]
1202
+ },
1203
+ {
1204
+ "cell_type": "markdown",
1205
+ "id": "3c3de283",
1206
+ "metadata": {},
1207
+ "source": [
1208
+ "## Predictions"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "cell_type": "code",
1213
+ "execution_count": 19,
1214
+ "id": "ed165edb",
1215
+ "metadata": {},
1216
+ "outputs": [],
1217
+ "source": [
1218
+ "from transformers import pipeline\n",
1219
+ "nlp = pipeline(\"ner\", model=model.to(device), tokenizer=tokenizer, aggregation_strategy=\"first\", device=0 if device==\"cuda\" else -1)"
1220
+ ]
1221
+ },
1222
+ {
1223
+ "cell_type": "code",
1224
+ "execution_count": 20,
1225
+ "id": "0e350503",
1226
+ "metadata": {},
1227
+ "outputs": [],
1228
+ "source": [
1229
+ "# Source: https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy\n",
1230
+ "def get_entities_html(text, ner_result, title=None):\n",
1231
+ " \"\"\"Visualize NER with the help of SpaCy\"\"\"\n",
1232
+ " ents = []\n",
1233
+ " for ent in ner_result:\n",
1234
+ " e = {}\n",
1235
+ " # add the start and end positions of the entity\n",
1236
+ " e[\"start\"] = ent[\"start\"]\n",
1237
+ " e[\"end\"] = ent[\"end\"]\n",
1238
+ " # add the score if you want in the label\n",
1239
+ " # e[\"label\"] = f\"{ent[\"entity\"]}-{ent['score']:.2f}\"\n",
1240
+ " e[\"label\"] = ent[\"entity_group\"]\n",
1241
+ " if ents and -1 <= ent[\"start\"] - ents[-1][\"end\"] <= 1 and ents[-1][\"label\"] == e[\"label\"]:\n",
1242
+ " # if the current entity is shared with previous entity\n",
1243
+ " # simply extend the entity end position instead of adding a new one\n",
1244
+ " ents[-1][\"end\"] = e[\"end\"]\n",
1245
+ " continue\n",
1246
+ " ents.append(e)\n",
1247
+ " # construct data required for displacy.render() method\n",
1248
+ " render_data = [\n",
1249
+ " {\n",
1250
+ " \"text\": text,\n",
1251
+ " \"ents\": ents,\n",
1252
+ " \"title\": title,\n",
1253
+ " }\n",
1254
+ " ]\n",
1255
+ " spacy.displacy.render(render_data, style=\"ent\", manual=True, jupyter=True)"
1256
+ ]
1257
+ },
1258
+ {
1259
+ "cell_type": "code",
1260
+ "execution_count": 21,
1261
+ "id": "f98a6902",
1262
+ "metadata": {},
1263
+ "outputs": [
1264
+ {
1265
+ "data": {
1266
+ "text/html": [
1267
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">Lütfen yardım \n",
1268
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1269
+ " Akevler\n",
1270
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">mahalle</span>\n",
1271
+ "</mark>\n",
1272
+ " mahallesi \n",
1273
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1274
+ " Rüzgar\n",
1275
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">sokak</span>\n",
1276
+ "</mark>\n",
1277
+ " sokak \n",
1278
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1279
+ " Tuncay\n",
1280
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">bina</span>\n",
1281
+ "</mark>\n",
1282
+ " apartmanı zemin kat \n",
1283
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1284
+ " Antakya\n",
1285
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ilce</span>\n",
1286
+ "</mark>\n",
1287
+ " akrabalarım göçük altında #hatay #Afad</div></span>"
1288
+ ],
1289
+ "text/plain": [
1290
+ "<IPython.core.display.HTML object>"
1291
+ ]
1292
+ },
1293
+ "metadata": {},
1294
+ "output_type": "display_data"
1295
+ }
1296
+ ],
1297
+ "source": [
1298
+ "sentence = \"\"\"Lütfen yardım Akevler mahallesi Rüzgar sokak Tuncay apartmanı zemin kat Antakya akrabalarım göçük altında #hatay #Afad\"\"\"\n",
1299
+ "\n",
1300
+ "get_entities_html(sentence, nlp(sentence))"
1301
+ ]
1302
+ },
1303
+ {
1304
+ "cell_type": "code",
1305
+ "execution_count": 22,
1306
+ "id": "80b823ff",
1307
+ "metadata": {},
1308
+ "outputs": [
1309
+ {
1310
+ "data": {
1311
+ "text/html": [
1312
+ "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
1313
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1314
+ " Kahramanmaraş\n",
1315
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">sehir</span>\n",
1316
+ "</mark>\n",
1317
+ " \n",
1318
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1319
+ " merkez\n",
1320
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ilce</span>\n",
1321
+ "</mark>\n",
1322
+ " \n",
1323
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1324
+ " Şazibey\n",
1325
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">mahalle</span>\n",
1326
+ "</mark>\n",
1327
+ " Mahallesi \n",
1328
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1329
+ " Ebrar\n",
1330
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">site</span>\n",
1331
+ "</mark>\n",
1332
+ " Sitesi \n",
1333
+ "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
1334
+ " Z\n",
1335
+ " <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">bina</span>\n",
1336
+ "</mark>\n",
1337
+ " blok arka tarafı için acil en az 150 tonluk vinç lazım lütfen paylaşır mısınız</div></span>"
1338
+ ],
1339
+ "text/plain": [
1340
+ "<IPython.core.display.HTML object>"
1341
+ ]
1342
+ },
1343
+ "metadata": {},
1344
+ "output_type": "display_data"
1345
+ }
1346
+ ],
1347
+ "source": [
1348
+ "sentence = \" \".join(dataset[\"train\"][433][\"tokens\"])\n",
1349
+ "get_entities_html(sentence, nlp(sentence))"
1350
+ ]
1351
+ }
1352
+ ],
1353
+ "metadata": {
1354
+ "kernelspec": {
1355
+ "display_name": "Python 3 (ipykernel)",
1356
+ "language": "python",
1357
+ "name": "python3"
1358
+ },
1359
+ "language_info": {
1360
+ "codemirror_mode": {
1361
+ "name": "ipython",
1362
+ "version": 3
1363
+ },
1364
+ "file_extension": ".py",
1365
+ "mimetype": "text/x-python",
1366
+ "name": "python",
1367
+ "nbconvert_exporter": "python",
1368
+ "pygments_lexer": "ipython3",
1369
+ "version": "3.9.12"
1370
+ }
1371
+ },
1372
+ "nbformat": 4,
1373
+ "nbformat_minor": 5
1374
+ }
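
For reference, a minimal inference sketch along the lines of the notebook's prediction cells: it assumes a fine-tuned checkpoint has been saved under the notebook's output_dir (the checkpoint-200 path below is illustrative, not taken from the commit) and reloads it with the same transformers APIs the notebook uses.

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Path to a checkpoint written by the Trainer run above; the exact
# checkpoint number is an assumption for illustration.
checkpoint_path = "../akoksal/earthquake_ner_models/checkpoint-200"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForTokenClassification.from_pretrained(checkpoint_path)

# Same pipeline setup as the notebook's prediction cells:
# aggregation_strategy="first" merges word pieces into word-level entities.
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

# Example sentence taken from the notebook's own demo cell.
print(ner("Lütfen yardım Akevler mahallesi Rüzgar sokak Tuncay apartmanı zemin kat Antakya"))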