{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c88f989c", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ['CUDA_VISIBLE_DEVICES']='7'" ] }, { "cell_type": "code", "execution_count": 2, "id": "bfdbe247", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-02-26 02:35:07.275938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-02-26 02:35:07.472394: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:07.472434: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", "2023-02-26 02:35:07.503598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2023-02-26 02:35:08.603575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:08.603678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:08.603689: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", "2023-02-26 02:35:15.326595: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.326728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.326831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327013: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusparse.so.11'; dlerror: libcusparse.so.11: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327205: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory\n", "2023-02-26 02:35:15.327224: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", "Skipping registering GPU devices...\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "import re\n", "import numpy as np\n", "from random import Random\n", "import torch\n", "import pandas as pd\n", "import spacy\n", "import random\n", "from datasets import load_dataset\n", "from transformers import (\n", " AutoModelForTokenClassification,\n", " AutoTokenizer,\n", " DataCollatorForTokenClassification,\n", " TrainingArguments,\n", " Trainer,\n", " set_seed)\n", "import numpy as np\n", "import datasets\n", "from collections import defaultdict\n", "from datasets import load_metric" ] }, { "cell_type": "code", "execution_count": 3, "id": "7a916e9f", "metadata": {}, "outputs": [], "source": [ "# !pip install seqeval" ] }, { "cell_type": "code", "execution_count": 4, "id": "4b0590b7", "metadata": {}, "outputs": [], "source": [ "per_device_train_batch_size = 16\n", "per_device_eval_batch_size = 32\n", "num_train_epochs = 5\n", "weight_decay = 0.1\n", "warmup_ratio = 0.1\n", "learning_rate = 5e-5\n", "load_best_model_at_end = True\n", "output_dir = \"../akoksal/earthquake_ner_models/\"\n", "old_data_path = \"annotated_address_dataset_07022023_766train_192test/\"\n", "data_path = \"deprem-private/ner_v12\"\n", "cache_dir = \"../akoksal/hf_cache\"\n", "saved_models_path = \"../akoksal/earthquake_ner_models/\"\n", "device = \"cuda\"\n", "seed = 42\n", "model_names = [\"dbmdz/bert-base-turkish-cased\",\n", " \"dbmdz/electra-base-turkish-mc4-cased-discriminator\",\n", " \"dbmdz/bert-base-turkish-128k-cased\",\n", " \"dbmdz/convbert-base-turkish-cased\",\n", " \"bert-base-multilingual-cased\",\n", " \"xlm-roberta-base\"]\n", 
"model_name = model_names[2]" ] }, { "cell_type": "code", "execution_count": 5, "id": "9aeb3dbe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'dbmdz/bert-base-turkish-128k-cased'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_name" ] }, { "cell_type": "code", "execution_count": 6, "id": "ffeb73e4", "metadata": {}, "outputs": [], "source": [ "set_seed(seed)" ] }, { "cell_type": "code", "execution_count": 7, "id": "a876c516", "metadata": {}, "outputs": [], "source": [ "id2label = {\n", " 0: \"O\",\n", " 1: \"B-bina\",\n", " 2: \"I-bina\",\n", " 3: \"B-bulvar\",\n", " 4: \"I-bulvar\",\n", " 5: \"B-cadde\",\n", " 6: \"I-cadde\",\n", " 7: \"B-diskapino\",\n", " 8: \"I-diskapino\",\n", " 9: \"B-ilce\",\n", " 10: \"I-ilce\",\n", " 11: \"B-isim\",\n", " 12: \"I-isim\",\n", " 13: \"B-mahalle\",\n", " 14: \"I-mahalle\",\n", " 15: \"B-sehir\",\n", " 16: \"I-sehir\",\n", " 17: \"B-site\",\n", " 18: \"I-site\",\n", " 19: \"B-sokak\",\n", " 20: \"I-sokak\",\n", " 21: \"B-soyisim\",\n", " 22: \"I-soyisim\",\n", " 23: \"B-telefonno\",\n", " 24: \"I-telefonno\",\n", "}\n", "\n", "label2id = {label: idx for idx, label in id2label.items()}\n", "label_names = list(label2id.keys())" ] }, { "cell_type": "code", "execution_count": 8, "id": "2e0caffc", "metadata": {}, "outputs": [], "source": [ "# from huggingface_hub import login\n", "# login()" ] }, { "cell_type": "code", "execution_count": 9, "id": "c74850f9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-128k-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForTokenClassification.from_pretrained(model_name,\n", " num_labels=len(label_names),\n", " id2label=id2label,\n", " cache_dir=cache_dir).to(device)" ] }, { "cell_type": "code", "execution_count": 10, "id": "4c1fe653", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration deprem-private--ner_v12-e2f61c5a18a7a738\n", "Found cached dataset text (/mounts/Users/cisintern/akoksal/.cache/huggingface/datasets/deprem-private___text/deprem-private--ner_v12-e2f61c5a18a7a738/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "22bc5f5f97204b41b2bc5dc3b71036e1", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ] },
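{ "cell_type": "markdown", "metadata": {}, "source": [ "The cells that load `deprem-private/ner_v12` and define `tokenized_dataset`, `compute_metrics`, and `trainer` are not present in this copy of the notebook. The next cell is a minimal sketch of that setup under the standard Hugging Face token-classification recipe; the function names, the label-alignment choice (only the first sub-token of each word keeps its tag, the rest are set to -100 so the loss ignores them), and the exact `TrainingArguments` values are assumptions inferred from the imports, the configuration cell, and the training logs, not the original code." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Reconstruction sketch (assumed, not the original cells): builds `dataset`,\n",
 "# `tokenized_dataset`, `compute_metrics`, and `trainer` used by the cells below.\n",
 "dataset = load_dataset(data_path)  # private dataset; may require use_auth_token=True\n",
 "\n",
 "def tokenize_and_align_labels(examples):\n",
 "    # Tokenize pre-split words and copy each word-level tag to its first sub-token;\n",
 "    # remaining sub-tokens and special tokens get -100 so the loss ignores them.\n",
 "    tokenized = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True)\n",
 "    all_labels = []\n",
 "    for i, tags in enumerate(examples[\"ner_tags\"]):\n",
 "        word_ids = tokenized.word_ids(batch_index=i)\n",
 "        previous_word = None\n",
 "        label_ids = []\n",
 "        for word_id in word_ids:\n",
 "            if word_id is None or word_id == previous_word:\n",
 "                label_ids.append(-100)\n",
 "            else:\n",
 "                label_ids.append(tags[word_id])\n",
 "            previous_word = word_id\n",
 "        all_labels.append(label_ids)\n",
 "    tokenized[\"labels\"] = all_labels\n",
 "    return tokenized\n",
 "\n",
 "tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)\n",
 "\n",
 "metric = load_metric(\"seqeval\")\n",
 "\n",
 "def compute_metrics(eval_preds):\n",
 "    logits, labels = eval_preds\n",
 "    predictions = np.argmax(logits, axis=-1)\n",
 "    true_labels = [[label_names[l] for l in row if l != -100] for row in labels]\n",
 "    true_preds = [\n",
 "        [label_names[p] for p, l in zip(p_row, l_row) if l != -100]\n",
 "        for p_row, l_row in zip(predictions, labels)\n",
 "    ]\n",
 "    results = metric.compute(predictions=true_preds, references=true_labels)\n",
 "    # Flatten seqeval's nested per-entity dicts into <entity>_<metric> keys,\n",
 "    # matching the eval_* keys reported in the logs below.\n",
 "    flat = {k: v for k, v in results.items() if not isinstance(v, dict)}\n",
 "    for entity, scores in results.items():\n",
 "        if isinstance(scores, dict):\n",
 "            flat[f\"{entity}_f1\"] = scores[\"f1\"]\n",
 "            flat[f\"{entity}_recall\"] = scores[\"recall\"]\n",
 "            flat[f\"{entity}_precision\"] = scores[\"precision\"]\n",
 "            flat[f\"{entity}_support\"] = scores[\"number\"]\n",
 "    return flat\n",
 "\n",
 "training_args = TrainingArguments(\n",
 "    output_dir=output_dir,\n",
 "    learning_rate=learning_rate,\n",
 "    per_device_train_batch_size=per_device_train_batch_size,\n",
 "    per_device_eval_batch_size=per_device_eval_batch_size,\n",
 "    num_train_epochs=num_train_epochs,\n",
 "    weight_decay=weight_decay,\n",
 "    warmup_ratio=warmup_ratio,\n",
 "    evaluation_strategy=\"epoch\",\n",
 "    save_strategy=\"epoch\",\n",
 "    logging_strategy=\"epoch\",\n",
 "    save_total_limit=3,\n",
 "    load_best_model_at_end=load_best_model_at_end,\n",
 "    seed=seed,\n",
 ")\n",
 "\n",
 "trainer = Trainer(\n",
 "    model=model,\n",
 "    args=training_args,\n",
 "    train_dataset=tokenized_dataset[\"train\"],\n",
 "    eval_dataset=tokenized_dataset[\"validation\"],\n",
 "    data_collator=DataCollatorForTokenClassification(tokenizer),\n",
 "    tokenizer=tokenizer,\n",
 "    compute_metrics=compute_metrics,\n",
 ")"
] },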
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "[250/250 01:12, Epoch 5/5]\n",
 "\n",
 "Epoch | Training Loss | Validation Loss | Overall Precision | Overall Recall | Overall F1 | Overall Accuracy\n",
 "1 | 1.349500 | 0.357321 | 0.783270 | 0.828974 | 0.805474 | 0.908936\n",
 "2 | 0.264700 | 0.220467 | 0.885149 | 0.899396 | 0.892216 | 0.944792\n",
 "3 | 0.158700 | 0.219565 | 0.876768 | 0.873239 | 0.875000 | 0.940808\n",
 "4 | 0.115000 | 0.215329 | 0.897541 | 0.881288 | 0.889340 | 0.946500\n",
 "5 | 0.093800 | 0.231558 | 0.895492 | 0.879276 | 0.887310 | 0.945361\n",
 "\n",
 "Validation F1 per entity (support) across epochs 1-5:\n",
 "bina (34): 0.600000, 0.782609, 0.805556, 0.857143, 0.833333\n",
 "bulvar (5): 0.000000, 0.666667, 0.666667, 0.909091, 0.909091\n",
 "cadde (24): 0.588235, 0.875000, 0.880000, 0.897959, 0.880000\n",
 "diskapino (28): 0.769231, 0.862069, 0.827586, 0.862069, 0.813559\n",
 "ilce (60): 0.830508, 0.894309, 0.881356, 0.881356, 0.888889\n",
 "isim (82): 0.888889, 0.884848, 0.822785, 0.810127, 0.833333\n",
 "mahalle (53): 0.750000, 0.897196, 0.886792, 0.886792, 0.895238\n",
 "sehir (72): 0.867133, 0.915493, 0.892086, 0.890511, 0.898551\n",
 "site (6): 0.000000, 0.181818, 0.400000, 0.727273, 0.727273\n",
 "sokak (29): 0.750000, 0.949153, 0.881356, 0.950820, 0.881356\n",
 "soyisim (71): 0.900000, 0.950355, 0.957143, 0.949640, 0.957143\n",
 "telefonno (33): 0.985075, 0.985075, 0.985075, 0.985075, 0.985075\n",
 "(Per-entity validation recall and precision were also reported at each epoch.)\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "/mounts/work/akoksal/anaconda3/envs/lmbias/lib/python3.9/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-50\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-50/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-100\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-100/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-150\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-150/special_tokens_map.json\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. 
If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-200\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-200/special_tokens_map.json\n", "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-50] due to args.save_total_limit\n", "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 58\n", " Batch size = 32\n", "Saving model checkpoint to /mounts/work/akoksal/earthquake_ner_models/checkpoint-250\n", "Configuration saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/config.json\n", "Model weights saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/pytorch_model.bin\n", "tokenizer config file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/tokenizer_config.json\n", "Special tokens file saved in /mounts/work/akoksal/earthquake_ner_models/checkpoint-250/special_tokens_map.json\n", "Deleting older checkpoint [/mounts/work/akoksal/earthquake_ner_models/checkpoint-100] due to args.save_total_limit\n", "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "Loading best model from /mounts/work/akoksal/earthquake_ner_models/checkpoint-200 (score: 0.21532948315143585).\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=250, training_loss=0.3963502960205078, metrics={'train_runtime': 73.0701, 'train_samples_per_second': 54.674, 'train_steps_per_second': 3.421, 'total_flos': 129863927953500.0, 'train_loss': 0.3963502960205078, 'epoch': 5.0})" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": 16, "id": "4427c32d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ids, ner_tags. If tokens, ids, ner_tags are not expected by `BertForTokenClassification.forward`, you can safely ignore this message.\n", "***** Running Evaluation *****\n", " Num examples = 129\n", " Batch size = 32\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [5/5 00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "results = trainer.evaluate(tokenized_dataset[\"test\"])" ] }, { "cell_type": "code", "execution_count": 24, "id": "aabbb977", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'eval_loss': 0.24822480976581573,\n", " 'eval_overall_precision': 0.8442211055276382,\n", " 'eval_overall_recall': 0.877742946708464,\n", " 'eval_overall_f1': 0.860655737704918,\n", " 'eval_overall_accuracy': 0.9401853411962932,\n", " 'eval_bina_f1': 0.7000000000000001,\n", " 'eval_bina_recall': 0.7424242424242424,\n", " 'eval_bina_precision': 0.6621621621621622,\n", " 'eval_bina_support': 66,\n", " 'eval_bulvar_f1': 0.9230769230769231,\n", " 'eval_bulvar_recall': 0.9230769230769231,\n", " 'eval_bulvar_precision': 0.9230769230769231,\n", " 'eval_bulvar_support': 13,\n", " 'eval_cadde_f1': 0.8067226890756302,\n", " 'eval_cadde_recall': 0.8421052631578947,\n", " 'eval_cadde_precision': 0.7741935483870968,\n", " 'eval_cadde_support': 57,\n", " 'eval_diskapino_f1': 0.7083333333333334,\n", " 'eval_diskapino_recall': 0.7285714285714285,\n", " 'eval_diskapino_precision': 0.6891891891891891,\n", " 'eval_diskapino_support': 70,\n", " 'eval_ilce_f1': 0.9218106995884773,\n", " 'eval_ilce_recall': 0.9572649572649573,\n", " 'eval_ilce_precision': 0.8888888888888888,\n", " 'eval_ilce_support': 117,\n", " 'eval_isim_f1': 0.8793103448275862,\n", " 'eval_isim_recall': 0.9026548672566371,\n", " 'eval_isim_precision': 0.8571428571428571,\n", " 'eval_isim_support': 113,\n", " 'eval_mahalle_f1': 0.7903225806451613,\n", " 'eval_mahalle_recall': 0.8166666666666667,\n", " 'eval_mahalle_precision': 0.765625,\n", " 'eval_mahalle_support': 120,\n", " 'eval_sehir_f1': 0.9724137931034483,\n", " 'eval_sehir_recall': 0.9657534246575342,\n", " 'eval_sehir_precision': 0.9791666666666666,\n", " 'eval_sehir_support': 146,\n", " 'eval_site_f1': 0.6875000000000001,\n", " 'eval_site_recall': 0.6111111111111112,\n", " 'eval_site_precision': 0.7857142857142857,\n", " 'eval_site_support': 18,\n", " 'eval_sokak_f1': 0.7301587301587302,\n", " 'eval_sokak_recall': 0.7419354838709677,\n", " 'eval_sokak_precision': 0.71875,\n", " 'eval_sokak_support': 62,\n", " 'eval_soyisim_f1': 0.9441624365482234,\n", " 'eval_soyisim_recall': 0.9489795918367347,\n", " 'eval_soyisim_precision': 0.9393939393939394,\n", " 'eval_soyisim_support': 98,\n", " 'eval_telefonno_f1': 0.9935483870967742,\n", " 'eval_telefonno_recall': 1.0,\n", " 'eval_telefonno_precision': 0.9871794871794872,\n", " 'eval_telefonno_support': 77,\n", " 'eval_runtime': 0.3493,\n", " 'eval_samples_per_second': 369.308,\n", " 'eval_steps_per_second': 14.314,\n", " 'epoch': 5.0}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] }, { "cell_type": "code", "execution_count": 18, "id": "922a7237", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
supportprecisionrecallf1accuracy
overall9570.840.880.860.94
bina660.660.740.70NaN
bulvar130.920.920.92NaN
cadde570.770.840.81NaN
diskapino700.690.730.71NaN
ilce1170.890.960.92NaN
isim1130.860.900.88NaN
mahalle1200.770.820.79NaN
sehir1460.980.970.97NaN
site180.790.610.69NaN
sokak620.720.740.73NaN
soyisim980.940.950.94NaN
telefonno770.991.000.99NaN
\n", "
" ], "text/plain": [ " support precision recall f1 accuracy\n", "overall 957 0.84 0.88 0.86 0.94\n", "bina 66 0.66 0.74 0.70 NaN\n", "bulvar 13 0.92 0.92 0.92 NaN\n", "cadde 57 0.77 0.84 0.81 NaN\n", "diskapino 70 0.69 0.73 0.71 NaN\n", "ilce 117 0.89 0.96 0.92 NaN\n", "isim 113 0.86 0.90 0.88 NaN\n", "mahalle 120 0.77 0.82 0.79 NaN\n", "sehir 146 0.98 0.97 0.97 NaN\n", "site 18 0.79 0.61 0.69 NaN\n", "sokak 62 0.72 0.74 0.73 NaN\n", "soyisim 98 0.94 0.95 0.94 NaN\n", "telefonno 77 0.99 1.00 0.99 NaN" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "structured_results = defaultdict(dict)\n", "structured_results[\"overall\"][\"support\"]=0\n", "for x, y in results.items():\n", " if len(x.split(\"_\"))==3:\n", " structured_results[x.split(\"_\")[1]][x.split(\"_\")[2]] = y\n", " if x.split(\"_\")[2]==\"support\":\n", " structured_results[\"overall\"][\"support\"]+=y\n", "results_pd = pd.DataFrame(structured_results).T\n", "results_pd.support = results_pd.support.astype(int)\n", "results_pd.round(2)" ] }, { "cell_type": "markdown", "id": "3c3de283", "metadata": {}, "source": [ "## Predictions" ] }, { "cell_type": "code", "execution_count": 19, "id": "ed165edb", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "nlp = pipeline(\"ner\", model=model.to(device), tokenizer=tokenizer, aggregation_strategy=\"first\", device=0 if device==\"cuda\" else -1)" ] }, { "cell_type": "code", "execution_count": 20, "id": "0e350503", "metadata": {}, "outputs": [], "source": [ "# Source: https://www.thepythoncode.com/article/named-entity-recognition-using-transformers-and-spacy\n", "def get_entities_html(text, ner_result, title=None):\n", " \"\"\"Visualize NER with the help of SpaCy\"\"\"\n", " ents = []\n", " for ent in ner_result:\n", " e = {}\n", " # add the start and end positions of the entity\n", " e[\"start\"] = ent[\"start\"]\n", " e[\"end\"] = ent[\"end\"]\n", " # add the score if you want in the label\n", " # e[\"label\"] = f\"{ent[\"entity\"]}-{ent['score']:.2f}\"\n", " e[\"label\"] = ent[\"entity_group\"]\n", " if ents and -1 <= ent[\"start\"] - ents[-1][\"end\"] <= 1 and ents[-1][\"label\"] == e[\"label\"]:\n", " # if the current entity is shared with previous entity\n", " # simply extend the entity end position instead of adding a new one\n", " ents[-1][\"end\"] = e[\"end\"]\n", " continue\n", " ents.append(e)\n", " # construct data required for displacy.render() method\n", " render_data = [\n", " {\n", " \"text\": text,\n", " \"ents\": ents,\n", " \"title\": title,\n", " }\n", " ]\n", " spacy.displacy.render(render_data, style=\"ent\", manual=True, jupyter=True)" ] }, { "cell_type": "code", "execution_count": 21, "id": "f98a6902", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Lütfen yardım [Akevler → mahalle] mahallesi [Rüzgar → sokak] sokak [Tuncay → bina] apartmanı zemin kat [Antakya → ilce] akrabalarım göçük altında #hatay #Afad
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sentence = \"\"\"Lütfen yardım Akevler mahallesi Rüzgar sokak Tuncay apartmanı zemin kat Antakya akrabalarım göçük altında #hatay #Afad\"\"\"\n", "\n", "get_entities_html(sentence, nlp(sentence))" ] }, { "cell_type": "code", "execution_count": 22, "id": "80b823ff", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " Kahramanmaraş\n", " sehir\n", "\n", " \n", "\n", " merkez\n", " ilce\n", "\n", " \n", "\n", " Şazibey\n", " mahalle\n", "\n", " Mahallesi \n", "\n", " Ebrar\n", " site\n", "\n", " Sitesi \n", "\n", " Z\n", " bina\n", "\n", " blok arka tarafı için acil en az 150 tonluk vinç lazım lütfen paylaşır mısınız
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sentence = \" \".join(dataset[\"train\"][433][\"tokens\"])\n", "get_entities_html(sentence, nlp(sentence))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }