In [None]:
# Uncomment to install `seqeval`, needed for the "seqeval" metric loaded later in this notebook.
# !pip install seqeval

In [None]:
# Optional sanity check: uncomment to see whether PyTorch detects a CUDA device
# and how many devices are visible.
# import torch
# torch.cuda.is_available(), torch.cuda.device_count()

In [None]:
import warnings
# Silences every Python warning from this point on, including any deprecation
# notices raised by the libraries imported below.
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import transformers
from transformers import Trainer
from datasets import load_metric
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

## Helper functions

In [None]:
def align_labels_with_tokens(labels: list, word_ids: list) -> list:
    """
    Expand word-level labels to one label per token.

    Each token receives the label of the word it belongs to, so a word that
    was split into several tokens has its label repeated. Positions whose
    word id is ``None`` receive ``-100``.

    :param labels: word-level labels, indexed by word id
    :type labels: list
    :param word_ids: word id of every token (repeated when a word was split),
        ``None`` where a token has no source word
    :type word_ids: list
    :return: labels aligned with the tokenized sequence, same length as ``word_ids``
    :rtype: list
    """
    aligned = []
    for word_id in word_ids:
        if word_id is None:
            aligned.append(-100)
        else:
            aligned.append(labels[word_id])
    return aligned

def tokenize_and_align_labels(examples):
    """
    Tokenize pre-split sequences and attach token-level labels.

    The module-level ``tokenizer`` is applied to ``examples["sequences"]``
    (already split into words, with truncation enabled). For every sequence
    the word-level labels in ``examples["ids"]`` are expanded to token level
    with ``align_labels_with_tokens`` and stored under ``"labels"``.

    :param examples: batch exposing ``"sequences"`` and ``"ids"`` entries
        (presumably the batch dict passed by ``map(..., batched=True)``)
    :return: the tokenizer output with an added ``"labels"`` entry
    """
    encoding = tokenizer(
        examples["sequences"], truncation=True, is_split_into_words=True
    )
    encoding["labels"] = [
        align_labels_with_tokens(word_labels, encoding.word_ids(row))
        for row, word_labels in enumerate(examples["ids"])
    ]
    return encoding

def compute_metrics(eval_preds):
    """
    Score model predictions with the module-level ``metric``.

    The highest-scoring class is taken for every token. Positions whose
    reference label is ``-100`` are skipped, and the remaining ids are
    converted to label strings through the module-level ``label_names``
    before being passed to ``metric.compute``.

    :param eval_preds: model output that unpacks into ``(logits, labels)``
    :return: dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``
    :rtype: dict
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    reference_tags = []
    predicted_tags = []
    for predicted_row, reference_row in zip(predicted_ids, labels):
        row_references = []
        row_predictions = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            # -100 marks ignored positions; drop them from both sides.
            if reference_id == -100:
                continue
            row_references.append(label_names[reference_id])
            row_predictions.append(label_names[predicted_id])
        reference_tags.append(row_references)
        predicted_tags.append(row_predictions)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

## Load Data

In [None]:
# Load the "surdan/nerel_short" dataset. The rest of the notebook reads its
# "train" and "dev" splits and the "sequences" / "ids" columns.
raw_datasets = load_dataset("surdan/nerel_short")

In [None]:
# Display the loaded dataset object to inspect it.
raw_datasets

## Preprocess data

In [None]:
# Pretrained checkpoint name, used below for both the tokenizer and the model.
model_checkpoint = "cointegrated/LaBSE-en-ru"

In [None]:
# Tokenizer for the chosen checkpoint. `tokenize_and_align_labels` reads this
# module-level name, and it is also passed to the data collator and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Apply `tokenize_and_align_labels` to the dataset in batches. The columns named
# in the "train" split are removed, so presumably only the tokenizer outputs and
# the new "labels" column remain — verify with the display cell below.
tokenized_datasets = raw_datasets.map(
 tokenize_and_align_labels,
 batched=True,
 remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Display the tokenized dataset object to inspect the resulting columns.
tokenized_datasets

## Init Training pipeline

In [None]:
# Read the id -> label mapping from a local pickle file.
# NOTE(review): `pickle.load` can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('id_to_label_map.pickle', 'rb') as f:
 map_id_to_label = pickle.load(f)

In [None]:
# Token-classification data collator built around the tokenizer; handed to the
# Trainer below as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Keep label ids as ints: `compute_metrics` indexes `label_names` by the integer
# class id, and `label2id` should map each label name back to an int id rather
# than to the string form of that id.
id2label = {int(k): v for k, v in map_id_to_label.items()}
label2id = {v: k for k, v in id2label.items()}
# Order the names by id instead of by dict insertion order, so that (with ids
# running 0..n-1) `label_names[i]` is the label of class i even if the pickled
# map was not stored in id order.
label_names = [id2label[i] for i in sorted(id2label)]

In [None]:
# Load the checkpoint as a token-classification model, passing the label
# mappings built above.
model = AutoModelForTokenClassification.from_pretrained(
 model_checkpoint,
 id2label=id2label,
 label2id=label2id,
)

In [None]:
# Sanity check: show the number of labels in the model config (expected to
# match the number of entries in the label mapping).
model.config.num_labels

In [None]:
# Training configuration for the fine-tuning run.
args = TrainingArguments(
    output_dir="LaBSE_ner_nerel",
    # optimisation
    num_train_epochs=25,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,  # adjust to the memory available on your GPU
    # evaluation / persistence
    evaluation_strategy="epoch",
    save_strategy="no",
    push_to_hub=False,
)

## Train model

In [None]:
# `compute_metrics` reads this module-level `metric` object, so this cell must
# run before training starts.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("seqeval")

In [None]:
# Assemble the trainer from the objects built above, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],  # the "dev" split serves as the evaluation set
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Save the model as it stands after the last epoch under
# "LaBSE_nerel_last_checkpoint" (no intermediate checkpoints are kept, since
# `save_strategy="no"` above).
trainer.save_model("LaBSE_nerel_last_checkpoint")