import os.path
from typing import Any, Dict, List

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from tokenizers import ByteLevelBPETokenizer
from transformers import PretrainedConfig

device = "cpu"
embedding_dim = 128
rnn_units = 256
vocab_size = 8000


def clean_text_line(line):
    line = line.strip()  # Remove leading and trailing whitespace
    line = line.lower()  # Lowercase the text (the model expects lowercase input)
    # Strip punctuation characters before vocabulary lookups
    for char in ["'", '"', ",", ".", "?", "!", ":", ";", "(", ")"]:
        line = line.replace(char, "")
    return line


def clean_text(text_lines):
    # Accept either a single string or a list of strings
    if isinstance(text_lines, list):
        return [clean_text_line(line) for line in text_lines]
    return clean_text_line(text_lines)


def spelling_error_rate(text_generated, vocabulary):
    # Split into words and strip punctuation before checking against the vocabulary
    words = clean_text(text_generated.split())
    total_words = len(words)
    if total_words == 0:
        return 0
    misspelled_words = [word for word in words if word.lower() not in vocabulary]
    return len(misspelled_words) / total_words


def generate_text_bpe(
    model,
    start_string,
    generation_length=1000,
    top_k=20,
    temperature=1.0,
    vocabulary=None,
    tokenizer=None,
):
    # Encode the start string to token IDs
    input_ids = tokenizer.encode(start_string).ids
    input_eval = torch.tensor([input_ids], device=device)

    # Collect the generated token strings here
    text_generated = []

    # Initialize the LSTM hidden state for a batch of one
    state_h, state_c = model.init_state(1)

    model.eval()  # Evaluation mode
    with torch.no_grad():
        for _ in range(generation_length):
            output, (state_h, state_c) = model(input_eval, (state_h, state_c))

            # Apply temperature scaling to the logits of the last position
            logits = output[0, -1] / temperature
            probabilities = torch.nn.functional.softmax(logits, dim=0).cpu().numpy()

            # Top-k sampling: keep the k most likely tokens and renormalize
            sorted_indices = np.argsort(probabilities)[-top_k:]
            top_probabilities = probabilities[sorted_indices]
            top_probabilities /= np.sum(top_probabilities)
            # Cast to a plain int so the tokenizer and torch accept it
            predicted_id = int(np.random.choice(sorted_indices, p=top_probabilities))

            # Feed the predicted token ID back in as the next input
            input_eval = torch.tensor([[predicted_id]], device=device)

            # Decode the predicted token ID to text and append it
            text_generated.append(tokenizer.decode([predicted_id]))

    generated_text = start_string + "".join(text_generated)
    error_rate = spelling_error_rate(generated_text, vocabulary)
    return error_rate, generated_text


def load_vocabulary_from_file(file_path):
    vocabulary = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()  # Remove any leading/trailing whitespace
            if word:  # Skip empty lines
                vocabulary.add(word.lower())  # Lowercase for consistent lookups
    return vocabulary


def get_model():
    class AurelioRNN(nn.Module, PyTorchModelHubMixin):
        def __init__(self, config: dict):
            super().__init__()
            self.config = PretrainedConfig()
            self.config.vocab_size = config.get("vocab_size")
            self.config.embedding_dim = config.get("embedding_dim")
            self.config.rnn_units = config.get("rnn_units")
            self.embedding = nn.Embedding(self.config.vocab_size, self.config.embedding_dim)
            self.lstm = nn.LSTM(
                self.config.embedding_dim, self.config.rnn_units, batch_first=True
            )
            self.fc = nn.Linear(self.config.rnn_units, self.config.vocab_size)

        def forward(self, x, state):
            x = self.embedding(x)
            x, state = self.lstm(x, state)
            x = self.fc(x)
            return x, state

        def init_state(self, batch_size):
            return (
                torch.zeros(1, batch_size, self.config.rnn_units).to(device),
                torch.zeros(1, batch_size, self.config.rnn_units).to(device),
            )

    return AurelioRNN


def calculate_perplexity_on_text(model, text, seq_length, tokenizer):
    loss_fn = nn.CrossEntropyLoss()
    model.eval()
    total_loss = 0
    total_words = 0

    # Tokenize the text
    ids = tokenizer.encode(text).ids

    if len(ids) <= seq_length:
        print(
            "Input text is too short to calculate perplexity. length:",
            len(ids),
            "seq_length:",
            seq_length,
        )
        return float("inf")

    # Build overlapping input/target windows shifted by one token
    inputs = [ids[i : i + seq_length] for i in range(len(ids) - seq_length)]
    targets = [ids[i + 1 : i + seq_length + 1] for i in range(len(ids) - seq_length)]

    state_h, state_c = model.init_state(1)

    with torch.no_grad():
        for i in range(len(inputs)):
            input_tensor = torch.tensor(inputs[i]).unsqueeze(0).to(device)
            target_tensor = torch.tensor(targets[i]).unsqueeze(0).to(device)

            output, (state_h, state_c) = model(
                input_tensor, (state_h.detach(), state_c.detach())
            )

            loss = loss_fn(output.transpose(1, 2), target_tensor)
            total_loss += loss.item()
            total_words += seq_length

    average_loss = total_loss / total_words
    return np.exp(average_loss)


class EndpointHandler:
    def __init__(self, path=""):
        # Load the model and tokenizer once at startup and reuse them for every request
        lstm = get_model()
        config = {
            "vocab_size": vocab_size,
            "embedding_dim": embedding_dim,
            "rnn_units": rnn_units,
        }
        self.model = lstm.from_pretrained("jed-tiotuico/aurelio-rnn", config=config)
        self.model.eval()  # Set the model to evaluation mode
        dir_path = os.path.abspath(os.path.dirname(__file__))
        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(dir_path, "aurelio_bpe-vocab.json"),
            os.path.join(dir_path, "aurelio_bpe-merges.txt"),
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (:obj:`dict`):
                The request payload; "inputs" holds a list whose first element
                is the start string for generation.
        Return:
            A :obj:`list` of dicts, one per generated sample, each containing:
                - "label": the spelling error rate of the generated text.
                - "score": 1 minus the error rate.
                - "generated_text": the generated Kapampangan text.
                - "perplexity": the model's perplexity on its own output.
        """
        print("data", data)
        inputs = data.pop("inputs", data)
        start_string = inputs[0]

        dir_path = os.path.abspath(os.path.dirname(__file__))
        # Load the Kapampangan vocabulary used to score spelling errors
        kapampangan_vocabulary = load_vocabulary_from_file(
            os.path.join(dir_path, "kapampangan.txt")
        )
        seq_length = 64

        predictions = []
        # Generate 10 samples
        for i in range(10):
            error_rate, generated_text = generate_text_bpe(
                self.model,
                start_string=start_string,
                generation_length=seq_length,
                temperature=1.2,
                top_k=20,
                vocabulary=kapampangan_vocabulary,
                tokenizer=self.tokenizer,
            )
            perplexity = calculate_perplexity_on_text(
                self.model,
                generated_text,
                seq_length=seq_length - 1,
                tokenizer=self.tokenizer,
            )
            predictions.append(
                {
                    "label": error_rate,
                    "score": 1 - error_rate,
                    "generated_text": generated_text,
                    "perplexity": perplexity,
                }
            )
        return predictions
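

# ---------------------------------------------------------------------------
# Local smoke test: an illustrative sketch, not part of the deployed handler.
# It assumes the model weights are reachable on the Hub and that the two BPE
# tokenizer files and kapampangan.txt sit next to this script, exactly as
# EndpointHandler expects. The prompt string below is an arbitrary example.
if __name__ == "__main__":
    handler = EndpointHandler()
    sample_payload = {"inputs": ["ing "]}  # hypothetical Kapampangan prompt
    for prediction in handler(sample_payload):
        print(
            f"error_rate={prediction['label']:.3f} "
            f"score={prediction['score']:.3f} "
            f"perplexity={prediction['perplexity']:.2f}"
        )
        print(prediction["generated_text"])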