# Source: jvamvas's Hugging Face Space, commit 35e7f94 ("Enable dark mode").
import gradio as gr
import utils
import os
# Custom Gradio theme: large text, square corners, and a Source Sans 3
# font stack with generic fallbacks.
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[
        gr.themes.GoogleFont("Source Sans 3"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)
# Load tokenizers only once during development.
# gr.NO_RELOAD guards this module-level work so Gradio's auto-reload dev
# server does not repeat the expensive loading on every source change.
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    # Mapping of model name -> tokenizer, supplied by the project-local utils
    # module; consumed by process_text() below.
    all_tokenizers = utils.load_tokenizers()
    # Insertion-ordered model names, used to lay out the UI panels below.
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")
# Model identifier -> icon filename. Built once at module load so
# read_svg_file() does not rebuild the dict on every call.
_ICON_MAP = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
    "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
    "ZurichNLP/swissbert": "swissbert.svg",
    "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
    "google/gemma-3-27b-it": "gemma.svg",
    "gpt-4o": "chatgpt.svg",
}

def read_svg_file(name: str) -> str:
    """Return the SVG markup for *name*'s model icon.

    Unknown model names fall back to the ChatGPT icon. If the icon file
    cannot be read, the error is logged and an empty string is returned
    so the caller simply renders no icon.
    """
    icon_path = os.path.join("icons", _ICON_MAP.get(name, "chatgpt.svg"))
    try:
        # SVG is XML text, conventionally UTF-8; be explicit instead of
        # relying on the locale's default encoding.
        with open(icon_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        # A missing or unreadable icon is non-fatal for the UI.
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""
def get_model_icon(name: str) -> str:
    """Return an inline-HTML icon container for *name*, or "" if the model
    is one of the collapsed ("More Models") entries or its SVG is missing."""
    collapsed_models = (
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    )
    # Collapsed models render without an icon.
    if name in collapsed_models:
        return ""
    svg = read_svg_file(name)
    if not svg:
        return ""
    # Force a fixed 24x24 footprint while keeping the aspect ratio.
    sized_svg = svg.replace(
        '<svg',
        '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"',
    )
    # Clip the icon inside a fixed-size inline container.
    return (
        '<div style="display: inline-block; vertical-align: middle; '
        'margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">'
        f'{sized_svg}</div>'
    )
def process_text(text):
    """Tokenize *text* with every pre-loaded tokenizer.

    Returns one HTML visualization per tokenizer (in dict order) followed
    by an update that makes the "More Models" accordion visible.
    """
    rendered = utils.visualize_tokens(text, all_tokenizers)
    return [*rendered.values(), gr.update(visible=True)]
# Create the Gradio interface: input textbox on the left, one HTML panel per
# tokenizer on the right, with less-prominent models tucked into an accordion.
# NOTE(review): in the CSS below, ".html-container { line-height: 2em; !important; }"
# has a stray ";" before "!important", which invalidates that declaration in
# CSS — confirm whether "line-height: 2em !important;" was intended.
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
.tokenizer-panel > div { background: var(--input-background-fill); }
.no-padding { padding: 0 !important; }
.form { border: 0 !important; }
.html-container { line-height: 2em; !important; }
.pending { opacity: 1; }
@media (prefers-color-scheme: dark) {
.gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
.html-container span { color: black !important; }
}
""") as demo:
    gr.Markdown("# Tokens matter.")
    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                # NOTE(review): Gradio's `every` expects a number/Timer, not a
                # bool — confirm `every=True` actually enables the intended
                # real-time behavior (the .change() handler below already
                # fires on each edit).
                every=True,  # This enables real-time updates
            )
        # Right column for outputs
        with gr.Column(scale=2):
            # Create output boxes for main tokenizers
            main_output_boxes = []
            more_output_boxes = []
            # Create 2x2 grid for main tokenizers; models in the hard-coded
            # "collapsed" list are skipped here and shown in the accordion
            # further below instead.
            with gr.Row():
                with gr.Column():
                    # First grid column: tokenizers 0-1.
                    for name in all_tokenizer_names[:2]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            # Header row: model icon (if any) + bold display name.
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            # Placeholder filled by process_text() on input change.
                            box = gr.HTML()
                            main_output_boxes.append(box)
                with gr.Column():
                    # Second grid column: tokenizers 2-3.
                    for name in all_tokenizer_names[2:4]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)
            # Accordion for the collapsed models; hidden until the first
            # tokenization (process_text returns gr.update(visible=True)).
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                # Only the models skipped in the main grid get a panel here.
                for name in all_tokenizer_names:
                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            more_output_boxes.append(box)
    # Output order must match process_text(): main boxes, accordion boxes,
    # then the accordion's visibility update.
    all_outputs = main_output_boxes + more_output_boxes + [more_models]
    # Use change event for real-time updates
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )
    # Clickable example sentences (Swiss German, German, French, Italian,
    # Romansh, English) that populate the input textbox.
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text
    )
# Script entry point: start the Gradio server when run directly (not when
# imported, e.g. by the Space runtime or the auto-reloader).
if __name__ == "__main__":
    demo.launch()