{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c88f989c", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ['CUDA_VISIBLE_DEVICES']='7'" ] }, { "cell_type": "code", "execution_count": 2, "id": "bfdbe247", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-02-26 02:35:07.275938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-02-26 02:35:07.472394: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:07.472434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", "2023-02-26 02:35:07.503598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2023-02-26 02:35:08.603575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:08.603678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:08.603689: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", "2023-02-26 02:35:15.326595: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.326728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.326831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327013: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327205: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327224: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", "Skipping registering GPU devices...\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "import re\n", "import numpy as np\n", "from random import Random\n", "import torch\n", "import pandas as pd\n", "import spacy\n", "import random\n", "from datasets import load_dataset\n", "from transformers import (\n", " AutoModelForTokenClassification,\n", " AutoTokenizer,\n", " DataCollatorForTokenClassification,\n", " TrainingArguments,\n", " Trainer,\n", " set_seed)\n", "import numpy as np\n", "import datasets\n", "from collections import defaultdict\n", "from datasets import load_metric" ] }, { "cell_type": "code", "execution_count": 3, "id": "7a916e9f", "metadata": {}, "outputs": [], "source": [ "# !pip install seqeval" ] }, { "cell_type": "code", "execution_count": 4, "id": "4b0590b7", "metadata": {}, "outputs": [], "source": [ "per_device_train_batch_size = 16\n", "per_device_eval_batch_size = 32\n", "num_train_epochs = 5\n", "weight_decay = 0.1\n", "warmup_ratio = 0.1\n", "learning_rate = 5e-5\n", "load_best_model_at_end = True\n", "output_dir = \"../akoksal/earthquake_ner_models/\"\n", "old_data_path = \"annotated_address_dataset_07022023_766train_192test/\"\n", "data_path = \"deprem-private/ner_v12\"\n", "cache_dir = \"../akoksal/hf_cache\"\n", "saved_models_path = \"../akoksal/earthquake_ner_models/\"\n", "device = \"cuda\"\n", "seed = 42\n", "model_names = [\"dbmdz/bert-base-turkish-cased\",\n", " \"dbmdz/electra-base-turkish-mc4-cased-discriminator\",\n", " \"dbmdz/bert-base-turkish-128k-cased\",\n", " \"dbmdz/convbert-base-turkish-cased\",\n", " \"bert-base-multilingual-cased\",\n", " \"xlm-roberta-base\"]\n", 
"model_name = model_names[2]" ] }, { "cell_type": "code", "execution_count": 5, "id": "9aeb3dbe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'dbmdz/bert-base-turkish-128k-cased'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_name" ] }, { "cell_type": "code", "execution_count": 6, "id": "ffeb73e4", "metadata": {}, "outputs": [], "source": [ "set_seed(seed)" ] }, { "cell_type": "code", "execution_count": 7, "id": "a876c516", "metadata": {}, "outputs": [], "source": [ "id2label = {\n", " 0: \"O\",\n", " 1: \"B-bina\",\n", " 2: \"I-bina\",\n", " 3: \"B-bulvar\",\n", " 4: \"I-bulvar\",\n", " 5: \"B-cadde\",\n", " 6: \"I-cadde\",\n", " 7: \"B-diskapino\",\n", " 8: \"I-diskapino\",\n", " 9: \"B-ilce\",\n", " 10: \"I-ilce\",\n", " 11: \"B-isim\",\n", " 12: \"I-isim\",\n", " 13: \"B-mahalle\",\n", " 14: \"I-mahalle\",\n", " 15: \"B-sehir\",\n", " 16: \"I-sehir\",\n", " 17: \"B-site\",\n", " 18: \"I-site\",\n", " 19: \"B-sokak\",\n", " 20: \"I-sokak\",\n", " 21: \"B-soyisim\",\n", " 22: \"I-soyisim\",\n", " 23: \"B-telefonno\",\n", " 24: \"I-telefonno\",\n", "}\n", "\n", "label2id = {label: idx for idx, label in id2label.items()}\n", "label_names = list(label2id.keys())" ] }, { "cell_type": "code", "execution_count": 8, "id": "2e0caffc", "metadata": {}, "outputs": [], "source": [ "# from huggingface_hub import login\n", "# login()" ] }, { "cell_type": "code", "execution_count": 9, "id": "c74850f9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForTokenClassification.from_pretrained(model_name,\n", " num_labels=len(label_names),\n", " id2label=id2label,\n", " cache_dir=cache_dir).to(device)" ] }, { "cell_type": "code", "execution_count": 10, "id": "4c1fe653", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration deprem-private--ner_v12-e2f61c5a18a7a738\n", "Found cached dataset text (/mounts/Users/cisintern/akoksal/.cache/huggingface/datasets/deprem-private___text/deprem-private--ner_v12-e2f61c5a18a7a738/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "22bc5f5f97204b41b2bc5dc3b71036e1", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ] },
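{ "cell_type": "markdown", "metadata": {}, "source": [ "The cells that load `deprem-private/ner_v12` and define `tokenized_dataset`, `compute_metrics`, and `trainer` are not present in this copy of the notebook. The next cell is a minimal sketch of that setup under the standard Hugging Face token-classification recipe; the function names, the label-alignment choice (only the first sub-token of each word keeps its tag, the rest are set to -100 so the loss ignores them), and the exact `TrainingArguments` values are assumptions inferred from the imports, the configuration cell, and the training logs, not the original code." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Reconstruction sketch (assumed, not the original cells): builds `dataset`,\n",
 "# `tokenized_dataset`, `compute_metrics`, and `trainer` used by the cells below.\n",
 "dataset = load_dataset(data_path)  # private dataset; may require use_auth_token=True\n",
 "\n",
 "def tokenize_and_align_labels(examples):\n",
 "    # Tokenize pre-split words and copy each word-level tag to its first sub-token;\n",
 "    # remaining sub-tokens and special tokens get -100 so the loss ignores them.\n",
 "    tokenized = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
 "    all_labels = []\n",
 "    for i, tags in enumerate(examples[\"ner_tags\"]):\n",
 "        word_ids = tokenized.word_ids(batch_index=i)\n",
 "        previous_word = None\n",
 "        label_ids = []\n",
 "        for word_id in word_ids:\n",
 "            if word_id is None or word_id == previous_word:\n",
 "                label_ids.append(-100)\n",
 "            else:\n",
 "                label_ids.append(tags[word_id])\n",
 "            previous_word = word_id\n",
 "        all_labels.append(label_ids)\n",
 "    tokenized[\"labels\"] = all_labels\n",
 "    return tokenized\n",
 "\n",
 "tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)\n",
 "\n",
 "metric = load_metric(\"seqeval\")\n",
 "\n",
 "def compute_metrics(eval_preds):\n",
 "    logits, labels = eval_preds\n",
 "    predictions = np.argmax(logits, axis=-1)\n",
 "    true_labels = [[label_names[l] for l in row if l != -100] for row in labels]\n",
 "    true_preds = [\n",
 "        [label_names[p] for p, l in zip(p_row, l_row) if l != -100]\n",
 "        for p_row, l_row in zip(predictions, labels)\n",
 "    ]\n",
 "    results = metric.compute(predictions=true_preds, references=true_labels)\n",
 "    # Flatten seqeval's nested per-entity dicts into <entity>_<metric> keys,\n",
 "    # matching the eval_* keys reported in the logs below.\n",
 "    flat = {k: v for k, v in results.items() if not isinstance(v, dict)}\n",
 "    for entity, scores in results.items():\n",
 "        if isinstance(scores, dict):\n",
 "            flat[f\"{entity}_f1\"] = scores[\"f1\"]\n",
 "            flat[f\"{entity}_recall\"] = scores[\"recall\"]\n",
 "            flat[f\"{entity}_precision\"] = scores[\"precision\"]\n",
 "            flat[f\"{entity}_support\"] = scores[\"number\"]\n",
 "    return flat\n",
 "\n",
 "training_args = TrainingArguments(\n",
 "    output_dir=output_dir,\n",
 "    learning_rate=learning_rate,\n",
 "    per_device_train_batch_size=per_device_train_batch_size,\n",
 "    per_device_eval_batch_size=per_device_eval_batch_size,\n",
 "    num_train_epochs=num_train_epochs,\n",
 "    weight_decay=weight_decay,\n",
 "    warmup_ratio=warmup_ratio,\n",
 "    evaluation_strategy=\"epoch\",\n",
 "    save_strategy=\"epoch\",\n",
 "    logging_strategy=\"epoch\",\n",
 "    save_total_limit=3,\n",
 "    load_best_model_at_end=load_best_model_at_end,\n",
 "    seed=seed,\n",
 ")\n",
 "\n",
 "trainer = Trainer(\n",
 "    model=model,\n",
 "    args=training_args,\n",
 "    train_dataset=tokenized_dataset[\"train\"],\n",
 "    eval_dataset=tokenized_dataset[\"validation\"],\n",
 "    data_collator=DataCollatorForTokenClassification(tokenizer),\n",
 "    tokenizer=tokenizer,\n",
 "    compute_metrics=compute_metrics,\n",
 ")"
] },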
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "[250/250 01:12, Epoch 5/5]\n",
 "\n",
 "Epoch | Training Loss | Validation Loss | Overall Precision | Overall Recall | Overall F1 | Overall Accuracy\n",
 "1 | 1.349500 | 0.357321 | 0.783270 | 0.828974 | 0.805474 | 0.908936\n",
 "2 | 0.264700 | 0.220467 | 0.885149 | 0.899396 | 0.892216 | 0.944792\n",
 "3 | 0.158700 | 0.219565 | 0.876768 | 0.873239 | 0.875000 | 0.940808\n",
 "4 | 0.115000 | 0.215329 | 0.897541 | 0.881288 | 0.889340 | 0.946500\n",
 "5 | 0.093800 | 0.231558 | 0.895492 | 0.879276 | 0.887310 | 0.945361\n",
 "\n",
 "Validation F1 per entity (support) across epochs 1-5:\n",
 "bina (34): 0.600000, 0.782609, 0.805556, 0.857143, 0.833333\n",
 "bulvar (5): 0.000000, 0.666667, 0.666667, 0.909091, 0.909091\n",
 "cadde (24): 0.588235, 0.875000, 0.880000, 0.897959, 0.880000\n",
 "diskapino (28): 0.769231, 0.862069, 0.827586, 0.862069, 0.813559\n",
 "ilce (60): 0.830508, 0.894309, 0.881356, 0.881356, 0.888889\n",
 "isim (82): 0.888889, 0.884848, 0.822785, 0.810127, 0.833333\n",
 "mahalle (53): 0.750000, 0.897196, 0.886792, 0.886792, 0.895238\n",
 "sehir (72): 0.867133, 0.915493, 0.892086, 0.890511, 0.898551\n",
 "site (6): 0.000000, 0.181818, 0.400000, 0.727273, 0.727273\n",
 "sokak (29): 0.750000, 0.949153, 0.881356, 0.950820, 0.881356\n",
 "soyisim (71): 0.900000, 0.950355, 0.957143, 0.949640, 0.957143\n",
 "telefonno (33): 0.985075, 0.985075, 0.985075, 0.985075, 0.985075\n",
 "(Per-entity validation recall and precision were also reported at each epoch.)\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-50\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-100\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-150\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. 
If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-200\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/special_tokens_map.json\n", "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-50] due to args.save_total_limit\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-250\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/special_tokens_map.json\n", "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-100] due to args.save_total_limit\n", "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "Loading best model from /mounts/work/akoksal/earthquake_ner_models/checkpoint-200 (score: 0.21532948315143585).\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=250, training_loss=0.3963502960205078, metrics={'train_runtime': 73.0701, 'train_samples_per_second': 54.674, 'train_steps_per_second': 3.421, 'total_flos': 129863927953500.0, 'train_loss': 0.3963502960205078, 'epoch': 5.0})" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": 16, "id": "4427c32d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 129\n", " Batch size = 32\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [5/5 00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "results = trainer.evaluate(tokenized_dataset[\"test\"])" ] }, { "cell_type": "code", "execution_count": 24, "id": "aabbb977", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'eval_loss': 0.24822480976581573,\n", " 'eval_overall_precision': 0.8442211055276382,\n", " 'eval_overall_recall': 0.877742946708464,\n", " 'eval_overall_f1': 0.860655737704918,\n", " 'eval_overall_accuracy': 0.9401853411962932,\n", " 'eval_bina_f1': 0.7000000000000001,\n", " 'eval_bina_recall': 0.7424242424242424,\n", " 'eval_bina_precision': 0.6621621621621622,\n", " 'eval_bina_support': 66,\n", " 'eval_bulvar_f1': 0.9230769230769231,\n", " 'eval_bulvar_recall': 0.9230769230769231,\n", " 'eval_bulvar_precision': 0.9230769230769231,\n", " 'eval_bulvar_support': 13,\n", " 'eval_cadde_f1': 0.8067226890756302,\n", " 'eval_cadde_recall': 0.8421052631578947,\n", " 'eval_cadde_precision': 0.7741935483870968,\n", " 'eval_cadde_support': 57,\n", " 'eval_diskapino_f1': 0.7083333333333334,\n", " 'eval_diskapino_recall': 0.7285714285714285,\n", " 'eval_diskapino_precision': 0.6891891891891891,\n", " 'eval_diskapino_support': 70,\n", " 'eval_ilce_f1': 0.9218106995884773,\n", " 'eval_ilce_recall': 0.9572649572649573,\n", " 'eval_ilce_precision': 0.8888888888888888,\n", " 'eval_ilce_support': 117,\n", " 'eval_isim_f1': 0.8793103448275862,\n", " 'eval_isim_recall': 0.9026548672566371,\n", " 'eval_isim_precision': 0.8571428571428571,\n", " 'eval_isim_support': 113,\n", " 'eval_mahalle_f1': 0.7903225806451613,\n", " 'eval_mahalle_recall': 0.8166666666666667,\n", " 'eval_mahalle_precision': 0.765625,\n", " 'eval_mahalle_support': 120,\n", " 'eval_sehir_f1': 0.9724137931034483,\n", " 'eval_sehir_recall': 0.9657534246575342,\n", " 'eval_sehir_precision': 0.9791666666666666,\n", " 'eval_sehir_support': 146,\n", " 'eval_site_f1': 0.6875000000000001,\n", " 'eval_site_recall': 0.6111111111111112,\n", " 'eval_site_precision': 0.7857142857142857,\n", " 'eval_site_support': 18,\n", " 'eval_sokak_f1': 0.7301587301587302,\n", " 'eval_sokak_recall': 0.7419354838709677,\n", " 'eval_sokak_precision': 0.71875,\n", " 'eval_sokak_support': 62,\n", " 'eval_soyisim_f1': 0.9441624365482234,\n", " 'eval_soyisim_recall': 0.9489795918367347,\n", " 'eval_soyisim_precision': 0.9393939393939394,\n", " 'eval_soyisim_support': 98,\n", " 'eval_telefonno_f1': 0.9935483870967742,\n", " 'eval_telefonno_recall': 1.0,\n", " 'eval_telefonno_precision': 0.9871794871794872,\n", " 'eval_telefonno_support': 77,\n", " 'eval_runtime': 0.3493,\n", " 'eval_samples_per_second': 369.308,\n", " 'eval_steps_per_second': 14.314,\n", " 'epoch': 5.0}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] }, { "cell_type": "code", "execution_count": 18, "id": "922a7237", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
supportprecisionrecallf1accuracy
overall9570.840.880.860.94
bina660.660.740.70NaN
bulvar130.920.920.92NaN
cadde570.770.840.81NaN
diskapino700.690.730.71NaN
ilce1170.890.960.92NaN
isim1130.860.900.88NaN
mahalle1200.770.820.79NaN
sehir1460.980.970.97NaN
site180.790.610.69NaN
sokak620.720.740.73NaN
soyisim980.940.950.94NaN
telefonno770.991.000.99NaN
\n", "
" ], "text/plain": [ " support precision recall f1 accuracy\n", "overall 957 0.84 0.88 0.86 0.94\n", "bina 66 0.66 0.74 0.70 NaN\n", "bulvar 13 0.92 0.92 0.92 NaN\n", "cadde 57 0.77 0.84 0.81 NaN\n", "diskapino 70 0.69 0.73 0.71 NaN\n", "ilce 117 0.89 0.96 0.92 NaN\n", "isim 113 0.86 0.90 0.88 NaN\n", "mahalle 120 0.77 0.82 0.79 NaN\n", "sehir 146 0.98 0.97 0.97 NaN\n", "site 18 0.79 0.61 0.69 NaN\n", "sokak 62 0.72 0.74 0.73 NaN\n", "soyisim 98 0.94 0.95 0.94 NaN\n", "telefonno 77 0.99 1.00 0.99 NaN" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "structured_results = defaultdict(dict)\n", "structured_results[\"overall\"][\"support\"]=0\n", "for x, y in results.items():\n", " if len(x.split(\"_\"))==3:\n", " structured_results[x.split(\"_\")[1]][x.split(\"_\")[2]] = y\n", " if x.split(\"_\")[2]==\"support\":\n", " structured_results[\"overall\"][\"support\"]+=y\n", "results_pd = pd.DataFrame(structured_results).T\n", "results_pd.support = results_pd.support.astype(int)\n", "results_pd.round(2)" ] }, { "cell_type": "markdown", "id": "3c3de283", "metadata": {}, "source": [ "## Predictions" ] }, { "cell_type": "code", "execution_count": 19, "id": "ed165edb", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "nlp = pipeline(\"ner\", model=model.to(device), tokenizer=tokenizer, aggregation_strategy=\"first\", device=0 if device==\"cuda\" else -1)" ] }, { "cell_type": "code", "execution_count": 20, "id": "0e350503", "metadata": {}, "outputs": [], "source": [ "# Source: https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy\n", "def get_entities_html(text, ner_result, title=None):\n", " \"\"\"Visualize NER with the help of SpaCy\"\"\"\n", " ents = []\n", " for ent in ner_result:\n", " e = {}\n", " # add the start and end positions of the entity\n", " e[\"start\"] = ent[\"start\"]\n", " e[\"end\"] = ent[\"end\"]\n", " # add the score if you want in the label\n", " # e[\"label\"] = f\"{ent[\"entity\"]}-{ent['score']:.2f}\"\n", " e[\"label\"] = ent[\"entity_group\"]\n", " if ents and -1 <= ent[\"start\"] - ents[-1][\"end\"] <= 1 and ents[-1][\"label\"] == e[\"label\"]:\n", " # if the current entity is shared with previous entity\n", " # simply extend the entity end position instead of adding a new one\n", " ents[-1][\"end\"] = e[\"end\"]\n", " continue\n", " ents.append(e)\n", " # construct data required for displacy.render() method\n", " render_data = [\n", " {\n", " \"text\": text,\n", " \"ents\": ents,\n", " \"title\": title,\n", " }\n", " ]\n", " spacy.displacy.render(render_data, style=\"ent\", manual=True, jupyter=True)" ] }, { "cell_type": "code", "execution_count": 21, "id": "f98a6902", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Lütfen yardım [Akevler → mahalle] mahallesi [Rüzgar → sokak] sokak [Tuncay → bina] apartmanı zemin kat [Antakya → ilce] akrabalarım göçük altında #hatay #Afad
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sentence = \"\"\"Lütfen yardım Akevler mahallesi Rüzgar sokak Tuncay apartmanı zemin kat Antakya akrabalarım göçük altında #hatay #Afad\"\"\"\n", "\n", "get_entities_html(sentence, nlp(sentence))" ] }, { "cell_type": "code", "execution_count": 22, "id": "80b823ff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " Kahramanmaraş\n", " sehir\n", "\n", " \n", "\n", " merkez\n", " ilce\n", "\n", " \n", "\n", " Şazibey\n", " mahalle\n", "\n", " Mahallesi \n", "\n", " Ebrar\n", " site\n", "\n", " Sitesi \n", "\n", " Z\n", " bina\n", "\n", " blok arka tarafı için acil en az 150 tonluk vinç lazım lütfen paylaşır mısınız
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sentence = \" \".join(dataset[\"train\"][433][\"tokens\"])\n", "get_entities_html(sentence, nlp(sentence))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }