Jordan Legg
working great!
a71870f
raw
history blame
2.92 kB
from functools import lru_cache

import gradio as gr
from transformers import T5TokenizerFast, CLIPTokenizer
@lru_cache(maxsize=1)
def _load_tokenizers():
    """Load the T5 and CLIP tokenizers once and reuse them afterwards.

    ``count_tokens`` runs on every change of the prompt box, so the
    ``from_pretrained`` calls are cached instead of being repeated per call.

    Returns:
        A ``(t5_tokenizer, clip_tokenizer)`` tuple.
    """
    t5_tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    return t5_tokenizer, clip_tokenizer


def _visible_tokens(tokenizer, token_ids):
    """Decode each token ID on its own and make invisible results visible.

    Whitespace-only decodings become "␣" and empty decodings become "∅" so
    that every token still shows up in the highlighted display.
    """
    pieces = []
    for token_id in token_ids:
        piece = tokenizer.decode([token_id])
        if piece.isspace():
            piece = "␣"  # visible space marker
        elif piece == "":
            piece = "∅"  # empty token marker
        pieces.append(piece)
    return pieces


def count_tokens(text):
    """Tokenize *text* with the T5 and CLIP tokenizers and summarise the result.

    ``encode`` is called with its default settings, so whatever special
    tokens each tokenizer adds by default are included in the IDs and counts.

    Args:
        text: The prompt to tokenize. ``None`` is treated as an empty string.

    Returns:
        A 6-tuple, T5 values first and CLIP values second:
        ``(t5_count, t5_highlights, t5_ids_str,
        clip_count, clip_highlights, clip_ids_str)`` where each
        ``*_highlights`` is a list of ``(token_text, "Token <index>")`` pairs
        and each ``*_ids_str`` is ``str()`` of the list of token IDs.
    """
    if text is None:
        text = ""

    t5_tokenizer, clip_tokenizer = _load_tokenizers()

    # Plain encode() already yields a list of IDs; requesting a "pt" tensor
    # only to convert it back to a list needlessly required PyTorch.
    t5_tokens = t5_tokenizer.encode(text)
    clip_tokens = clip_tokenizer.encode(text)

    t5_decoded = _visible_tokens(t5_tokenizer, t5_tokens)
    clip_decoded = _visible_tokens(clip_tokenizer, clip_tokens)

    # (text, label) tuples; a distinct label per position gives each token
    # its own entry in the highlighted view.
    t5_highlights = [(token, f"Token {i}") for i, token in enumerate(t5_decoded)]
    clip_highlights = [(token, f"Token {i}") for i, token in enumerate(clip_decoded)]

    return (
        # T5 outputs
        len(t5_tokens),
        t5_highlights,
        str(t5_tokens),
        # CLIP outputs
        len(clip_tokens),
        clip_highlights,
        str(clip_tokens),
    )
def _token_panel(heading, count_label, tokens_label, ids_label):
    """Build one tokenizer's result column in the currently open layout.

    Returns the count, highlighted-token and token-ID components, in that
    order, so they can be wired up as event outputs.
    """
    with gr.Column():
        gr.Markdown(heading)
        count_box = gr.Number(label=count_label)
        token_view = gr.HighlightedText(label=tokens_label, show_legend=True)
        id_box = gr.Textbox(label=ids_label, lines=2)
    return count_box, token_view, id_box


# Page layout: heading, prompt box, then the T5 and CLIP panels side by side.
iface = gr.Blocks(title="Common Diffusion Model Token Counter")
with iface:
    gr.Markdown("# Common Diffusion Model Token Counter")
    gr.Markdown("Enter text to count tokens using T5 and CLIP tokenizers, commonly used in diffusion models.")

    with gr.Row():
        text_input = gr.Textbox(label="Diffusion Prompt", placeholder="Enter your prompt here...")

    with gr.Row():
        # Left panel: T5
        t5_count, t5_highlights, t5_ids = _token_panel(
            "### T5 Tokenizer Results",
            "T5 Token Count",
            "T5 Tokens",
            "T5 Token IDs",
        )
        # Right panel: CLIP
        clip_count, clip_highlights, clip_ids = _token_panel(
            "### CLIP Tokenizer Results",
            "CLIP Token Count",
            "CLIP Tokens",
            "CLIP Token IDs",
        )

    # Re-run the tokenizers whenever the prompt text changes; the output
    # order matches the tuple returned by count_tokens.
    result_components = [t5_count, t5_highlights, t5_ids, clip_count, clip_highlights, clip_ids]
    text_input.change(
        fn=count_tokens,
        inputs=[text_input],
        outputs=result_components,
    )

# Start serving the interface.
iface.launch(show_error=True)