|
import gradio as gr |
|
from transformers import AutoTokenizer |
|
|
|
# Load the GPT-OSS tokenizer once at import time; tokenize_dialogue() reads
# this module-level object on every call.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
|
|
|
|
|
def tokenize_dialogue(dialogue_data):
    """
    Tokenize the dialogue using the GPT-OSS tokenizer.

    Args:
        dialogue_data: Iterable of mappings read via ``.get("speaker")`` and
            ``.get("text")``. A missing speaker defaults to ``"user"`` and a
            missing text to ``""``. Entries whose speaker is not
            ``"system"``, ``"user"`` or ``"assistant"`` are skipped.

    Returns:
        A 2-tuple ``(highlighted, token_count)``: a ``gr.HighlightedText``
        whose value holds one ``(decoded_token, token_id_as_str)`` pair per
        token and whose ``color_map`` assigns a colour to every distinct
        token id, and the total number of tokens as an ``int``.

    Raises:
        ValueError: If the module-level ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer not loaded. Please check your installation.")

    # Only these roles are forwarded to the chat template; anything else is
    # dropped rather than passed through under an unknown role.
    known_roles = ("system", "user", "assistant")
    messages = []
    for message in dialogue_data:
        role = message.get("speaker", "user")
        if role in known_roles:
            messages.append({"role": role, "content": message.get("text", "")})

    formatted_input = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np",
    )

    # The template result is indexed as a batch; row 0 is this conversation.
    token_ids = formatted_input[0].tolist()

    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
    color_map = {}
    decoded_text = []

    for i, token_id in enumerate(token_ids):
        # The token id doubles as the highlight category, so it must be a
        # string. Use that same string for the membership test: checking the
        # raw int against string keys never matches and would recolour a
        # token id each time it reappears.
        label = str(token_id)
        if label not in color_map:
            color_map[label] = colors[i % len(colors)]
        decoded_text.append((tokenizer.decode([token_id]), label))

    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)
|
|
|
def create_sample_dialogue():
    """
    Build the canned conversation loaded by the "Load Sample" button.

    Returns:
        A new list of ``{"speaker": ..., "text": ...}`` dicts, one per turn,
        in conversation order.
    """
    turns = (
        ("system", "You are a helpful assistant."),
        ("user", "Hello! How are you today?"),
        ("assistant", "I'm doing well, thank you for asking! How can I help you today?"),
        ("user", "Can you explain what MXFP4 quantization is?"),
    )
    return [{"speaker": who, "text": said} for who, said in turns]
|
|
|
# UI definition: a dialogue editor on the left, tokenization results on the
# right, and a collapsible help section. Event handlers are wired at the end.
with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
    gr.Markdown("# GPT-OSS Tokenizer Explorer")
    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Dialogue")

            dialogue_input = gr.Dialogue(
                speakers=["system", "user", "assistant"],
                label="Enter your dialogue",
                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
                show_submit_button=True,
                show_copy_button=True,
                type="dialogue",
                ui_mode="dialogue-only",
            )

            with gr.Row():
                sample_btn = gr.Button("Load Sample", variant="secondary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            gr.Markdown("### Tokenization Results")

            highlighted_output = gr.HighlightedText(
                label="Tokenized Output",
                show_inline_category=False,
            )

            token_count = gr.Label(
                value="Total Tokens: 0",
                label="Token Count",
            )

    with gr.Accordion("How to use", open=False):
        gr.Markdown("""
        ### Instructions:
        1. **Enter dialogue**: Use the dialogue component to enter conversations
        2. **Speaker format**: Type `system:`, `user:`, or `assistant:` followed by your message
        3. **Submit**: Click 'Tokenize Dialogue' to process the conversation
        4. **View results**: See the tokenization details in the output area

        ### Example:
        ```
        system: You are a helpful assistant.
        user: Hello! How are you today?
        assistant: I'm doing well, thank you for asking!
        ```

        ### What you'll see:
        - **Total tokens**: Number of tokens in the conversation
        - **Tokenized output**: How the tokenizer formats the conversation
        """)

    def process_dialogue(dialogue):
        """Tokenize the submitted dialogue.

        Returns one value per wired output component, in order:
        ``(highlighted_output, token_count)``.
        """
        if not dialogue:
            # The submit event has exactly two outputs, so return exactly two
            # values. The hint is wrapped as a single unlabelled span so it is
            # a valid HighlightedText value.
            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"

        highlighted, n_tokens = tokenize_dialogue(dialogue)
        return highlighted, f"Total Tokens: {n_tokens}"

    def clear_dialogue():
        """Reset the dialogue input, the highlighted output and the counter."""
        return None, [], "Total Tokens: 0"

    sample_btn.click(
        fn=create_sample_dialogue,
        outputs=[dialogue_input],
    )

    clear_btn.click(
        fn=clear_dialogue,
        outputs=[dialogue_input, highlighted_output, token_count],
    )

    dialogue_input.submit(
        fn=process_dialogue,
        inputs=[dialogue_input],
        outputs=[highlighted_output, token_count],
    )
|
|
|
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()