import itertools
import json

import torch

from transformer_model import TransformerModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
d_model = 512           # Dimension of the embeddings and the token representations
seq_length = 10         # Length of the input and output sequences
vocab_size = 25672      # Size of the vocabulary
batch_size = 32         # Batch size for training
num_heads = 8           # Number of heads in multi-head attention
dim_feedforward = 2048  # Dimension of the feedforward network in encoder and decoder

# Assuming the TransformerModel class is defined in transformer_model.py
model = TransformerModel(vocab_size, d_model, num_heads, dim_feedforward, seq_length)
model.load_state_dict(torch.load('transformer_model.pth', map_location=device))
model.to(device)
model.eval()  # Set the model to evaluation mode

# Load the vocabulary
with open('vocabulary.json', 'r') as vocab_file:
    vocab = json.load(vocab_file)

# Make sure the special tokens exist in the vocabulary
if '<unk>' not in vocab:
    vocab['<unk>'] = len(vocab)  # Assign the next integer index to <unk>
if '<pad>' not in vocab:
    vocab['<pad>'] = len(vocab)  # Assign the next integer index to <pad>


def text_to_tensor(text, vocab, seq_length):
    tokens = text.split()
    indices = [vocab.get(token, vocab['<unk>']) for token in tokens]  # Replace unknown tokens with <unk>
    indices = indices[:seq_length]
    indices += [vocab['<pad>']] * (seq_length - len(indices))  # Pad to seq_length
    return torch.tensor(indices, dtype=torch.long, device=device).unsqueeze(0)  # Add batch dimension


input_text = "please make the"
input_tensor = text_to_tensor(input_text, vocab, seq_length)

src = input_tensor
tgt = input_tensor

# Earlier additive-mask versions, kept for reference:
# def generate_square_subsequent_mask(sz):
#     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
#     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
#     return mask
#
# def create_padding_mask(seq):
#     return (seq == vocab['<pad>']).transpose(0, 1)


# Generate a square subsequent (causal) mask: True above the diagonal blocks attention to future positions
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz, device=device), diagonal=1).bool()
    return mask


# Create a padding mask: True wherever the sequence holds the padding token
def create_padding_mask(seq):
    mask = (seq == vocab['<pad>']).transpose(0, 1)
    return mask


src_seq_len = src.size(1)
tgt_seq_len = tgt.size(1)

src_mask = generate_square_subsequent_mask(src_seq_len)
# src_mask = torch.zeros((src_seq_len, src_seq_len)).type(torch.bool)
tgt_mask = generate_square_subsequent_mask(tgt_seq_len)

src_key_padding_mask = create_padding_mask(src)
tgt_key_padding_mask = create_padding_mask(tgt)

# Inspect the tensor shapes
print(src.size(), tgt.size())
print(src_mask.size(), tgt_mask.size())
print(src_key_padding_mask.size(), tgt_key_padding_mask.size())

with torch.no_grad():
    output = model(src, tgt, src_mask, tgt_mask,
                   src_key_padding_mask.transpose(0, 1),
                   tgt_key_padding_mask.transpose(0, 1))

predicted_indices = torch.argmax(output, dim=-1).squeeze(0).tolist()
print(predicted_indices)

inverse_vocab = {value: key for key, value in vocab.items()}

# Flatten the nested index list and map indices back to tokens
flattened_list = list(itertools.chain.from_iterable(predicted_indices))
print([inverse_vocab[index] for index in flattened_list])


def generate_prediction(text, model, vocab, seq_length):
    model.eval()  # Make sure the model is in eval mode

    # Convert text to tensor
    input_tensor = text_to_tensor(text, vocab, seq_length)

    # Generate prediction
    with torch.no_grad():
        output = model(input_tensor, input_tensor)  # For simplicity, using the same tensor as src and tgt

    # Convert output tensor to token indices (you may need additional post-processing)
    predicted_indices = torch.argmax(output, dim=-1).squeeze(0).tolist()
    if predicted_indices and isinstance(predicted_indices[0], list):
        predicted_indices = list(itertools.chain.from_iterable(predicted_indices))

    # Map indices back to tokens with the inverse vocabulary
    inverse_vocab = {value: key for key, value in vocab.items()}
    predicted_tokens = [inverse_vocab[index] for index in predicted_indices]

    return predicted_tokens


# Example usage
text = """Here were the servants of your adversary
And yours"""
prediction = generate_prediction(text, model, vocab, seq_length)
print(prediction)
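

# --- Optional: token-by-token greedy decoding (sketch) ---
# generate_prediction() above only re-scores the prompt because it feeds the
# same tensor as src and tgt. The helper below is a minimal sketch of greedy
# autoregressive decoding instead. greedy_decode and max_new_tokens are
# hypothetical names introduced here; the sketch assumes the two-argument
# model(src, tgt) call used above and an output whose last dimension is the
# vocabulary, so adjust the indexing if your TransformerModel differs.
def greedy_decode(text, model, vocab, seq_length, max_new_tokens=5):
    inverse_vocab = {value: key for key, value in vocab.items()}
    src = text_to_tensor(text, vocab, seq_length)
    generated = text.split()[:seq_length]

    with torch.no_grad():
        for _ in range(max_new_tokens):
            if len(generated) >= seq_length:
                break  # No room left in the fixed-length target window
            tgt = text_to_tensor(" ".join(generated), vocab, seq_length)
            output = model(src, tgt)
            # Collapse the logits to one predicted index per target position
            # and take the prediction at the last real (non-padding) position.
            indices = torch.argmax(output, dim=-1).squeeze()
            next_index = indices[len(generated) - 1].item()
            generated.append(inverse_vocab.get(next_index, '<unk>'))

    return " ".join(generated)

# Example (uncomment to try):
# print(greedy_decode("please make the", model, vocab, seq_length))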