from transformers import AutoTokenizer
import gradio as gr
import random

checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
checkpoints = [
    checkpoint,
    "microsoft/phi-2",
    "openai/whisper-large-v3",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "bert-base-cased",
]

placeholder = "Type anything in this text box and hit Tokenize!"

sequences = [
    "The quick brown šŸ¦Š fox jumps over the lazy šŸ• dog!",
    "How vexingly ā© quick daft šŸ¦“ zebras jump?",
    "Pack my šŸ“¦ box with five dozen šŸ· liquor jugs.",
    "The five šŸ„Š boxing šŸ§™ā€ā™‚ļø wizards jump quickly~",
    "While making deep ā›ļø excavations we found some quaint bronze šŸ’ jewelry!",
    "Whenever the šŸ¦Š fox jumped, the šŸæļø squirrel gazed suspiciously...",
    "We promptly šŸ§‘ā€āš–ļø judged antique ivory buckles for the next šŸ† prize.",
]


def randomize_sequence():
    return random.choice(sequences)


# Call the function so the text box starts with a random sequence,
# not with the function object itself.
sequence = randomize_sequence()


def load_tokenizer(checkpoint):
    global tokenizer
    if "tokenizer" not in globals():
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    try:
        if checkpoint == tokenizer.name_or_path:
            gr.Info(f"Tokenizer '{checkpoint}' already loaded")
        else:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Sort the vocabulary by ID so it lines up with the Vocabulary IDs panel.
        vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
        # Assumes the lowest-ID entry is the unknown/special token; pop it so it
        # can be shown separately.
        unk = next(iter(vocab))
        vocab.pop(unk)
        vocab_sorted = "\n".join(vocab)
        vocab_size = len(vocab)
        gr.Info(f"Tokenizer vocab size: {vocab_size}")
        return vocab_size, unk, vocab_sorted
    except Exception as error:
        gr.Warning("An unexpected error occurred while loading the tokenizer.")
        gr.Warning(f"{error}")
        return None, None, None


def tokenize_er(checkpoint, sequence):
    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
    try:
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Pair each token with its vocabulary ID for the editable DataFrame.
        token_id_pair = [[token, id_] for token, id_ in zip(tokens, ids)]
        return token_id_pair, vocab_size, unk, vocab_sorted
    except NameError:
        gr.Warning("Select a tokenizer before sequencing.")
        return [[None, None]], None, None, None


def de_tokenize_er(pairs):
    try:
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                # Edited cells may hold non-numeric values; fall back to ID 0.
                ids.append(0)
        # Re-encode the (possibly edited) tokens and decode both columns so the
        # two results can be compared side by side.
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(token_ids)
        decoded_ids = tokenizer.decode(ids)
        return token_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize a sequence before decoding.")
        return None, None, None
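# A minimal sketch of the round trip the handlers above perform, assuming the
# default TinyLlama checkpoint (the token strings shown are illustrative, not
# guaranteed output):
#
#   tok = AutoTokenizer.from_pretrained(checkpoint)
#   tokens = tok.tokenize("The quick brown fox")   # e.g. ['▁The', '▁quick', ...]
#   ids = tok.convert_tokens_to_ids(tokens)        # one vocabulary ID per token
#   tok.decode(ids)                                # ≈ "The quick brown fox"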
with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# šŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... šŸ•µļøšŸ•³ļø\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right āž”ļø\n\nāš ļø Loading the vocabulary can take a few seconds.")
            with gr.Row():
                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from šŸ¤— Hugging Face Models; it will only download the tokenizer data! Image models won't work here.")
                with gr.Group():
                    input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
            with gr.Row():
                gr.Markdown("\n#### 2. Sequence & Tokenize")
            with gr.Row():
                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
            with gr.Row():
                token_id_pair = gr.DataFrame(col_count=(2, "fixed"), headers=["Token", "Vocabulary ID"], value=[[None, 0]], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("\n#### šŸŽ² Tokenizer Data")
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
                output_vocab = gr.Code(label="Vocabulary IDs")
    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count, output_unknown_token, output_vocab], queue=True)
    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
    btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids])

frontend.launch()
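# To try this locally (an assumption about deployment; the file name app.py is
# hypothetical, not stated in the original):
#   pip install transformers gradio
#   python app.py   # Gradio serves on http://127.0.0.1:7860 by default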