|
import gradio as gr |
|
from transformers import AutoTokenizer |
|
import random |
|
import colorsys |
|
import html |
|
|
|
def get_distinct_colors(n):
    """Return *n* visually distinct hex color strings (e.g. "#b24747").

    Hues are spaced evenly around the HSV color wheel at a fixed,
    muted saturation/value so white text stays readable on top.
    """
    saturation = 0.6
    value = 0.7
    palette = []
    for idx in range(n):
        hue = idx / n
        red, green, blue = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(red * 255), int(green * 255), int(blue * 255)
        )
        palette.append(hex_color)
    return palette
|
|
|
def tokenize_text(hf_model_id, text, token=None):
    """Tokenize *text* with the tokenizer for *hf_model_id*.

    Args:
        hf_model_id: HuggingFace Hub model id (e.g. "unsloth/gemma-3-27b-it").
        text: The input string to tokenize.
        token: Optional HuggingFace access token for gated models. Gradio
            passes "" when the textbox is empty, which is treated as None.

    Returns:
        (token_count, html_string) on success; on any failure returns
        (0, error_html) so the gr.Number output always receives a number.
    """
    try:
        # Normalize Gradio's empty-string textbox value to None, and use the
        # correct `token=` kwarg (the original `access_token=` is not a
        # from_pretrained parameter, so gated-model auth silently failed).
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=token or None)
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        colors = get_distinct_colors(token_count)
        colored_tokens = []
        for i, tok in enumerate(tokens):
            # 'Ġ' marks a leading space in byte-level BPE vocabularies
            # (GPT-2 style); make it visible. Escape AFTER the replacement
            # so '<space>' renders literally rather than as a tag.
            display_token = html.escape(tok.replace('Ġ', '<space>'))
            colored_tokens.append(f'<span style="background-color: {colors[i]}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{display_token}</span>')
        return token_count, "".join(colored_tokens)
    except Exception as e:
        # Best-effort error surface: keep the count output numeric and show
        # the (escaped) message in the HTML panel instead.
        return 0, f'<span style="color: red;">Error: {html.escape(str(e))}</span>'
|
|
|
# Gradio UI: model id + text + optional HF token in; token count and
# color-coded token HTML out.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Hugging Face Model ID", placeholder="unsloth/gemma-3-27b-it", value="unsloth/gemma-3-27b-it"),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here..."),
        gr.Textbox(label="HuggingFace Token (optional)", placeholder="hf_...", lines=1)
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True)
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    # NOTE(review): `allow_flagging` was renamed to `flagging_mode` in
    # Gradio 5 — confirm the installed version before upgrading this kwarg.
    allow_flagging="never"
)

if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. from tests or
    # `gradio` reload tooling) does not start a server as a side effect.
    demo.launch()