{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ca08817",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install seqeval  # %pip (not !pip) targets the active kernel's environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5958200",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick GPU sanity check -- uncomment when debugging the environment\n",
    "# import torch\n",
    "# torch.cuda.is_available(), torch.cuda.device_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "590c3f48",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pickle\n",
    "import numpy as np\n",
    "import transformers\n",
    "from transformers import Trainer\n",
    "from datasets import load_metric\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "from transformers import TrainingArguments\n",
    "from transformers import AutoModelForTokenClassification\n",
    "from transformers import DataCollatorForTokenClassification"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44d7c35c",
   "metadata": {},
   "source": [
    "## Helper functions\n",
    "\n",
    "These rely on module-level objects (`tokenizer`, `metric`, `label_names`) created in later cells, so run the whole notebook top to bottom before calling them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c9e36d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def align_labels_with_tokens(labels: list, word_ids: list) -> list:\n",
    "    \"\"\"\n",
    "    Align word-level labels with a (possibly subword-split) tokenized sequence.\n",
    "\n",
    "    Special tokens (word_id is None) get -100 so the loss ignores them;\n",
    "    every subword token inherits the label of its source word.\n",
    "\n",
    "    NOTE(review): inner subwords of a B-* word keep the B-* tag here; the\n",
    "    HF-course variant remaps them to I-* -- confirm this suits the label scheme.\n",
    "\n",
    "    :param labels: word-level label ids for one sequence\n",
    "    :type labels: list\n",
    "    :param word_ids: word index per token (repeated for split words, None for special tokens)\n",
    "    :type word_ids: list\n",
    "    :return: token-level label ids aligned with the tokenized sequence\n",
    "    :rtype: list\n",
    "    \"\"\"\n",
    "    return [-100 if i is None else labels[i] for i in word_ids]\n",
    "\n",
    "def tokenize_and_align_labels(examples):\n",
    "    \"\"\"\n",
    "    Tokenize pre-split sequences and align their label ids with the tokens.\n",
    "\n",
    "    Depends on the module-level `tokenizer` created in a later cell -- that\n",
    "    cell must have been executed before this function is mapped over the dataset.\n",
    "\n",
    "    :param examples: batch with \"sequences\" (lists of words) and \"ids\" (word-level label ids)\n",
    "    :type examples: dict-like batch from datasets.Dataset.map\n",
    "    :return: tokenizer output with an added token-level \"labels\" field\n",
    "    :rtype: BatchEncoding\n",
    "    \"\"\"\n",
    "    tokenized_inputs = tokenizer(\n",
    "        examples[\"sequences\"], truncation=True, is_split_into_words=True\n",
    "    )\n",
    "    all_labels = examples[\"ids\"]\n",
    "    new_labels = []\n",
    "    for i, labels in enumerate(all_labels):\n",
    "        word_ids = tokenized_inputs.word_ids(i)\n",
    "        new_labels.append(align_labels_with_tokens(labels, word_ids))\n",
    "\n",
    "    tokenized_inputs[\"labels\"] = new_labels\n",
    "    return tokenized_inputs\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    \"\"\"\n",
    "    Compute seqeval precision/recall/F1/accuracy for a Trainer evaluation.\n",
    "\n",
    "    Depends on the module-level `metric` and `label_names` defined in later cells.\n",
    "\n",
    "    :param eval_preds: (logits, label_ids) pair supplied by Trainer\n",
    "    :type eval_preds: tuple\n",
    "    :return: overall precision, recall, f1 and accuracy\n",
    "    :rtype: dict\n",
    "    \"\"\"\n",
    "    logits, labels = eval_preds\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "\n",
    "    # Remove ignored index (special tokens, -100) and convert ids to label strings\n",
    "    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n",
    "    true_predictions = [[label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n",
    "                        for prediction, label in zip(predictions, labels)\n",
    "    ]\n",
    "    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n",
    "    return {\n",
    "        \"precision\": all_metrics[\"overall_precision\"],\n",
    "        \"recall\": all_metrics[\"overall_recall\"],\n",
    "        \"f1\": all_metrics[\"overall_f1\"],\n",
    "        \"accuracy\": all_metrics[\"overall_accuracy\"],\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8760e709",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8c723f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets = load_dataset(\"surdan/nerel_short\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e540a898",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a4947d1",
   "metadata": {},
   "source": [
    "## Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8829557e",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_checkpoint = \"cointegrated/LaBSE-en-ru\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6c13ad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea2c1a9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets = raw_datasets.map(\n",
    "    tokenize_and_align_labels,\n",
    "    batched=True,\n",
    "    remove_columns=raw_datasets[\"train\"].column_names,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b15c3cf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9b5b9b1",
   "metadata": {},
   "source": [
    "## Init Training pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b24d86e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "## id -> entity-label mapping produced during dataset preparation\n",
    "# NOTE(review): pickle.load executes arbitrary code -- only load trusted files.\n",
    "with open('id_to_label_map.pickle', 'rb') as f:\n",
    "    map_id_to_label = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d90a6d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d890df2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# HF model configs store id2label with string keys\n",
    "id2label = {str(k): v for k, v in map_id_to_label.items()}\n",
    "label2id = {v: k for k, v in id2label.items()}\n",
    "# NOTE(review): assumes map_id_to_label keys iterate in id order 0..n-1 -- verify\n",
    "label_names = list(id2label.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31bcfd6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# num_labels is inferred from id2label/label2id\n",
    "model = AutoModelForTokenClassification.from_pretrained(\n",
    "    model_checkpoint,\n",
    "    id2label=id2label,\n",
    "    label2id=label2id,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84497580",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.config.num_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ccfbf74",
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    \"LaBSE_ner_nerel\",\n",
    "    # NOTE(review): this kwarg was renamed `eval_strategy` in transformers >= 4.41;\n",
    "    # keep `evaluation_strategy` for this pinned (Python 3.8-era) environment.\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"no\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=25,\n",
    "    weight_decay=0.01,\n",
    "    push_to_hub=False,\n",
    "    per_device_train_batch_size=4,  # tune to the total memory of your GPU\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c798d567",
   "metadata": {},
   "source": [
    "## Train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1348d188",
   "metadata": {},
   "outputs": [],
   "source": [
    "## for compute_metrics function\n",
    "# NOTE(review): `datasets.load_metric` is deprecated (removed in datasets 3.x);\n",
    "# newer environments should use `evaluate.load(\"seqeval\")` instead.\n",
    "metric = load_metric(\"seqeval\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cff0367",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"dev\"],\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "576a10f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.save_model(\"LaBSE_nerel_last_checkpoint\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "451d6db1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf_env",
   "language": "python",
   "name": "hf_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}