import sentencepiece as spm
import torch
import torch.nn.functional as F
from transformers.models.bert.tokenization_bert import BertTokenizer

BASELINE = "baseline"
KOBE_ATTRIBUTE = "kobe-attr"
KOBE_KNOWLEDGE = "kobe-know"
KOBE_FULL = "kobe-full"


def get_bert_vocab_size(vocab_path: str) -> int:
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    return tokenizer.vocab_size


def get_vocab_size(vocab_path: str) -> int:
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(vocab_path)
    return len(tokenizer)


# Metrics
def accuracy(logits: torch.Tensor, targets: torch.Tensor) -> float:
    assert logits.dim() == 2
    assert targets.dim() == 1
    pred = logits.argmax(dim=1)
    return (pred == targets).sum().item() / targets.shape[0]


def top_k_top_p_sampling(
    logits, top_k=0, top_p=0.0, temperature=1, filter_value=-float("Inf")
) -> int:
    """Sample from a filtered distribution of logits using top-k and/or nucleus (top-p) filtering.

    Args:
        logits: logits distribution of shape (vocabulary size,).
        top_k > 0: keep only the top k tokens with the highest probability (top-k filtering).
        top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751).
    """
    logits /= temperature
    assert (
        logits.dim() == 1
    )  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value

    # Sample from the filtered distribution
    probabilities = F.softmax(logits, dim=-1)
    next_token = torch.multinomial(probabilities, 1)
    return int(next_token.item())


def diversity(tokenized_lines, n=4) -> int:
    """Defined as the number of unique n-grams generated on the test set."""
    n_grams_all = []
    for line in tokenized_lines:
        n_grams = list(zip(*[line[i:] for i in range(n)]))
        n_grams_all += n_grams
    return len(set(n_grams_all))
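

# Minimal usage sketch (not part of the original module): exercises accuracy,
# top_k_top_p_sampling, and diversity on toy tensors. The vocabulary size,
# sampling hyperparameters, and example tokens below are illustrative
# assumptions, not values from the project.
if __name__ == "__main__":
    # Classification accuracy on a toy batch of 4 examples over 3 classes.
    toy_logits = torch.tensor(
        [[2.0, 0.1, -1.0], [0.3, 1.5, 0.2], [0.0, 0.0, 3.0], [1.0, 0.9, 0.8]]
    )
    toy_targets = torch.tensor([0, 1, 2, 1])
    print("accuracy:", accuracy(toy_logits, toy_targets))

    # Combined top-k / nucleus sampling over a toy next-token distribution
    # (assumed vocabulary size of 10).
    toy_token_logits = torch.randn(10)
    sampled = top_k_top_p_sampling(
        toy_token_logits, top_k=5, top_p=0.9, temperature=0.8
    )
    print("sampled token id:", sampled)

    # Distinct 4-grams across two tokenized hypotheses.
    hypotheses = [["a", "b", "c", "d", "e"], ["a", "b", "c", "d", "f"]]
    print("distinct 4-grams:", diversity(hypotheses, n=4))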