Spaces:

vpkprasanna
/

TokenizerViz

Sleeping

File size: 1,753 Bytes

2d0a0f5
5dca0b0
 
 
2d0a0f5
5dca0b0
 
 
 
 
 
 
 
 
 
c6a1e30
5dca0b0
c6a1e30
 
 
5dca0b0
 
 
 
 
c6a1e30
5dca0b0
c6a1e30
5dca0b0
 
 
 
 
 
 
 
 
 
 
 
 
c6a1e30
5dca0b0
 
 
c6a1e30
 
5dca0b0
2d0a0f5
86eb1ac
b60f3da
c6a1e30

import gradio as gr
from transformers import AutoTokenizer
import ast
# Directory prefix that process_input() concatenates with the selected model
# name to form the path passed to AutoTokenizer.from_pretrained().
model_path = "models/"

import gradio as gr

# Available models: the names offered in the "Select Model" dropdown. The
# chosen name is appended to model_path when the tokenizer is loaded.
MODELS = ["Meta-Llama-3.1-8B"]

def _error_result(message):
    """Return an error tuple with one entry per output box of the interface."""
    # The interface declares four outputs, so every return path must yield
    # exactly four values.
    return "Error", message, "", ""


def _parse_token_ids(raw_value):
    """Turn user-entered text into a list of non-negative integer token IDs.

    Accepted forms: a Python list/tuple literal (``[1, 2, 3]``), a single
    integer (``42``), or integers separated by whitespace and/or commas
    (``1 2 3``).

    Raises:
        ValueError: if the text is empty or any item is not a non-negative
            integer.
    """
    text = (raw_value or "").strip()
    if not text:
        raise ValueError("no token IDs supplied")

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "1 2 3"): fall back to splitting on
        # whitespace/commas; non-numeric pieces are rejected by int() below.
        parsed = text.replace(",", " ").split()

    if isinstance(parsed, int):
        # A lone integer is treated as a one-element sequence.
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple)):
        raise ValueError("token IDs must be a sequence of integers")

    token_ids = []
    for item in parsed:
        # bool is a subclass of int; True/False are not meaningful token IDs.
        if isinstance(item, bool):
            raise ValueError("token IDs must be integers")
        if isinstance(item, str):
            item = int(item)  # raises ValueError for non-numeric text
        elif not isinstance(item, int):
            raise ValueError("token IDs must be integers")
        if item < 0:
            raise ValueError("token IDs must be non-negative")
        token_ids.append(item)
    return token_ids


def process_input(input_type, input_value, model_name):
    """Tokenize text, or decode token IDs, with the selected model's tokenizer.

    Args:
        input_type: ``"Text"`` to tokenize ``input_value``, or ``"Token IDs"``
            to decode it.
        input_value: The raw text, or the token IDs as text (see
            ``_parse_token_ids`` for the accepted forms).
        model_name: Name appended to ``model_path`` to locate the tokenizer.

    Returns:
        A 4-tuple matching the interface outputs
        ``(token count, character count, tokens, token IDs)``:

        * Text: ``(len(tokens), len(text), tokens, token_ids)``.
        * Token IDs: ``(len(token_ids), len(decoded_text), decoded_text,
          input_value)``.
        * Any invalid input: ``("Error", message, "", "")``.
    """
    # Validate everything that can be checked cheaply before loading the
    # tokenizer, so bad input never pays for (or fails inside) the model load.
    if not model_name:
        return _error_result("Please select a model.")
    if input_type not in ("Text", "Token IDs"):
        return _error_result(f"Unsupported input type: {input_type!r}")

    if input_type == "Token IDs":
        try:
            token_ids = _parse_token_ids(input_value)
        except ValueError:
            return _error_result(
                "Invalid input. Please enter space-separated integers for Token IDs."
            )
        tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)
        # Convert token IDs back to text
        text = tokenizer.decode(token_ids)
        # The character count is measured on the decoded text, not on the IDs.
        return len(token_ids), len(text), text, input_value

    # input_type == "Text"
    text = input_value or ""
    tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return len(tokens), len(text), tokens, token_ids

# Create Gradio interface.
# The three inputs map positionally onto process_input(input_type, input_value,
# model_name); the four outputs map onto the 4-tuple it returns.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Radio(["Text", "Token IDs"], label="Input Type", value="Text"),
        gr.Textbox(lines=5, label="Input"),
        # Pre-select the first model so model_name is not None when the user
        # submits without touching the dropdown.
        gr.Dropdown(
            choices=MODELS,
            label="Select Model",
            value=MODELS[0] if MODELS else None,
        ),
    ],
    outputs=[
        gr.Textbox(label="Token Count"),
        gr.Textbox(label="Character Count"),
        gr.Textbox(label="Tokens", lines=10),
        gr.Textbox(label="Token IDS", lines=5),
    ],
    title="LLM Tokenization - Convert Text to tokens and vice versa!",
    description="Enter text or token IDs and select a model to see the results.",
)

if __name__ == "__main__":
    # Enable request queuing, then start the web server.
    iface.queue()
    iface.launch()