# This is a heavily adapted version of this notebook:
# https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb,
# where we show on a simple text classification problem how we can integrate
# components for uncertainty quantification into large pretrained models.
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    TrainerCallback,
)

from uq import BertForUQSequenceClassification

BATCH_SIZE = 16
EVAL_BATCH_SIZE = 128
DEVICE = "cpu"

# CoLA dataset for determining whether sentences are grammatically correct
task = "cola"
model_checkpoint = "bert-base-uncased"

dataset = load_dataset("glue", task)
metric = evaluate.load("glue", task)

# Load our tokenizer and tokenize the dataset in batches
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def tokenize_data(data):
    # Adds input ID and attention mask columns to the dataset
    return tokenizer(data["sentence"], truncation=True)


encoded_dataset = dataset.map(tokenize_data, batched=True)

# Now we can load our pretrained model and introduce our uncertainty quantification
# component, which in this case is a GP final layer without any spectral
# normalization of the transformer weights
num_labels = 2
id2label = {0: "Invalid", 1: "Valid"}
label2id = {val: key for key, val in id2label.items()}

model = BertForUQSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Specify training arguments
metric_name = "matthews_correlation"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
    use_mps_device=False,
    no_cuda=True,
)


# Set up metric tracking
def compute_metrics(eval_predictions):
    predictions, labels = eval_predictions
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


# Finally, set up the trainer for finetuning the model
model.to(DEVICE)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# Add a callback to reset the covariance matrix after each epoch: we only need the
# covariance computed over the final epoch, so resetting ensures we don't double
# count any of the data. We could use a more elegant solution, but the covariance
# computation is very cheap, so doing it a few times rather than once isn't a big deal.
class ResetCovarianceCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            self._trainer.model.classifier.reset_cov()


trainer.add_callback(ResetCovarianceCallback(trainer))

trainer.train()
trainer.push_to_hub()
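
# --------------------------------------------------------------------------- #
# Optional: a minimal sketch (not part of the original notebook) of one way to
# turn the finetuned model's validation-set logits into per-example uncertainty
# scores. As in compute_metrics above, we assume trainer.predict returns one row
# of logits per example; whether those logits already account for the GP layer's
# posterior variance depends on the `uq` implementation.
preds = trainer.predict(encoded_dataset["validation"])

# Softmax over the logits (numerically stabilised by subtracting the row max)
logits = preds.predictions - preds.predictions.max(axis=1, keepdims=True)
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)

# Predictive entropy: 0 for a fully confident prediction, log(2) for a 50/50 split
entropy = -np.sum(probs * np.log(probs + 1e-12), axis=1)

# Print the ten sentences the model is least certain about
for idx in np.argsort(entropy)[::-1][:10]:
    print(f"{entropy[idx]:.3f}  {dataset['validation'][int(idx)]['sentence']}")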