import gradio as gr
import utils
import os

# Create a custom theme
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
)
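
# gr.NO_RELOAD: when the app is launched with `gradio app.py` (hot reload), code in
# the guarded block below is not re-executed on reloads; with plain `python app.py`
# it runs like a normal `if True:` block.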
# Load tokenizers only once during development
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    all_tokenizers = utils.load_tokenizers()
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")


def read_svg_file(name: str) -> str:
    """Read SVG file content."""
    icon_map = {
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
        "ZurichNLP/swissbert": "swissbert.svg",
        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
        "google/gemma-3-27b-it": "gemma.svg",
        "gpt-4o": "chatgpt.svg",
    }
    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
    try:
        with open(icon_path, 'r') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""


def get_model_icon(name: str) -> str:
    """Get the HTML for the model icon."""
    # Skip icons for collapsed models
    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
        return ""
    svg_content = read_svg_file(name)
    if svg_content:
        # Add preserveAspectRatio and fixed sizing to the SVG element
        svg_content = svg_content.replace('<svg', '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"')
        # Wrap in a container that maintains aspect ratio
        return f'<div style="display: inline-block; vertical-align: middle; margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">{svg_content}</div>'
    return ""


def process_text(text):
    """Process the input text and return visualizations for all tokenizers."""
    # Use the pre-loaded tokenizers
    visualizations = utils.visualize_tokens(text, all_tokenizers)
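    # Assumes utils.visualize_tokens returns a dict whose value order matches the
    # layout order of the output boxes built below (main panels, then accordion panels).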
    return list(visualizations.values()) + [gr.update(visible=True)]


# Create the Gradio interface
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
    .tokenizer-panel > div { background: var(--input-background-fill); }
    .no-padding { padding: 0 !important; }
    .form { border: 0 !important; }
    .html-container { line-height: 2em !important; }
    .pending { opacity: 1; }
    @media (prefers-color-scheme: dark) {
        .gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
        .html-container span { color: black !important; }
    }
""") as demo:
    gr.Markdown("# Tokens matter.")

    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                # Real-time updates are driven by the .change listener below;
                # `every` only applies when `value` is a callable, so it is omitted here.
            )

        # Right column for outputs
        with gr.Column(scale=2):
            # Create output boxes for main tokenizers
            main_output_boxes = []
            more_output_boxes = []

            # Create 2x2 grid for main tokenizers
            with gr.Row():
                with gr.Column():
                    for name in all_tokenizer_names[:2]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)
                with gr.Column():
                    for name in all_tokenizer_names[2:4]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)

            # Display more tokenizers in accordion
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                for name in all_tokenizer_names:
                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            more_output_boxes.append(box)

    all_outputs = main_output_boxes + more_output_boxes + [more_models]
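    # The accordion component itself is the last output, so the trailing
    # gr.update(visible=True) returned by process_text reveals "More Models"
    # once the first results arrive.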

    # Use change event for real-time updates
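    # (.change fires on every edit to the textbox; show_progress="hidden" keeps the
    # output panels from flashing a loading state on each keystroke)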
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text,
    )

if __name__ == "__main__":
    demo.launch()