Spaces:

freddyaboulton
/

gpt-oss-tokenizer-playground

Sleeping

App Files Files Community

gpt-oss-tokenizer-playground / app.py

freddyaboulton HF Staff

Create app.py

4a104a5 verified 10 days ago

raw

history blame contribute delete

4.9 kB

	import gradio as gr
	from transformers import AutoTokenizer

	# Loaded once at import time so every request handled by the app reuses the
	# same tokenizer object.
	tokenizer = AutoTokenizer.from_pretrained(
	    pretrained_model_name_or_path="openai/gpt-oss-20b",
	)


	def tokenize_dialogue(dialogue_data):
	    """
	    Tokenize a dialogue with the GPT-OSS tokenizer's chat template.

	    Args:
	        dialogue_data: Iterable of mappings. Each entry is read with
	            ``.get("speaker", "user")`` and ``.get("text", "")``. Entries
	            whose speaker is not "system", "user" or "assistant" are skipped.

	    Returns:
	        A 2-tuple ``(highlighted, n_tokens)``. ``highlighted`` is a
	        ``gr.HighlightedText`` whose value is a list of
	        ``(decoded_token, token_id_string)`` pairs and whose ``color_map``
	        maps each token-id string to a colour. ``n_tokens`` is the number of
	        token ids produced by the chat template.

	    Raises:
	        ValueError: If the module-level ``tokenizer`` is ``None``.
	    """
	    if tokenizer is None:
	        raise ValueError("Tokenizer not loaded. Please check your installation.")

	    known_roles = ("system", "user", "assistant")
	    messages = []
	    for message in dialogue_data:
	        role = message.get("speaker", "user")
	        if role in known_roles:
	            messages.append({"role": role, "content": message.get("text", "")})

	    formatted_input = tokenizer.apply_chat_template(
	        messages,
	        add_generation_prompt=True,
	        return_tensors="np",
	    )

	    # Index 0 selects the single conversation from the returned array.
	    token_ids = formatted_input[0].tolist()

	    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"]
	    decoded_text = []
	    color_map = {}

	    for i, token_id in enumerate(token_ids):
	        label = str(token_id)
	        # color_map is keyed by the string label, so membership must be
	        # checked with the string as well. setdefault keeps the colour
	        # chosen at a token id's first occurrence.
	        color_map.setdefault(label, colors[i % len(colors)])
	        decoded_text.append((tokenizer.decode([token_id]), label))

	    return gr.HighlightedText(value=decoded_text, color_map=color_map), len(token_ids)

	def create_sample_dialogue():
	    """
	    Build the demonstration conversation loaded by the "Load Sample" button.

	    Returns:
	        A new list of four ``{"speaker": ..., "text": ...}`` dicts in
	        conversation order (system, user, assistant, user).
	    """
	    turns = (
	        ("system", "You are a helpful assistant."),
	        ("user", "Hello! How are you today?"),
	        ("assistant", "I'm doing well, thank you for asking! How can I help you today?"),
	        ("user", "Can you explain what MXFP4 quantization is?"),
	    )
	    return [{"speaker": who, "text": said} for who, said in turns]

	# Build the page. Components, handlers and event wiring are all created
	# inside the Blocks context so they attach to `demo`.
	with gr.Blocks(title="GPT-OSS Tokenizer Explorer") as demo:
	    gr.Markdown("# GPT-OSS Tokenizer Explorer")
	    gr.Markdown("Enter a dialogue and see how the GPT-OSS tokenizer processes it. Use the format `speaker: message` in the dialogue component.")

	    with gr.Row():
	        # Left column: dialogue editor plus helper buttons.
	        with gr.Column(scale=1):
	            gr.Markdown("### Input Dialogue")

	            dialogue_input = gr.Dialogue(
	                speakers=["system", "user", "assistant"],
	                label="Enter your dialogue",
	                placeholder="Type 'system:', 'user:', or 'assistant:' followed by your message",
	                show_submit_button=True,
	                show_copy_button=True,
	                type="dialogue",
	                ui_mode="dialogue-only",
	            )

	            with gr.Row():
	                sample_btn = gr.Button("Load Sample", variant="secondary")
	                clear_btn = gr.Button("Clear", variant="secondary")

	        # Right column: tokenization results.
	        with gr.Column(scale=1):
	            gr.Markdown("### Tokenization Results")

	            highlighted_output = gr.HighlightedText(
	                label="Tokenized Output",
	                show_inline_category=False,
	            )

	            token_count = gr.Label(
	                value="Total Tokens: 0",
	                label="Token Count",
	            )

	    with gr.Accordion("How to use", open=False):
	        gr.Markdown("""
	        ### Instructions:
	        1. Enter dialogue: Use the dialogue component to enter conversations
	        2. Speaker format: Type `system:`, `user:`, or `assistant:` followed by your message
	        3. Submit: Click 'Tokenize Dialogue' to process the conversation
	        4. View results: See the tokenization details in the output area

	        ### Example:
	        ```
	        system: You are a helpful assistant.
	        user: Hello! How are you today?
	        assistant: I'm doing well, thank you for asking!
	        ```

	        ### What you'll see:
	        - Total tokens: Number of tokens in the conversation
	        - Tokenized output: How the tokenizer formats the conversation
	        """)

	    def process_dialogue(dialogue):
	        """
	        Tokenize the submitted dialogue for the two result components.

	        Args:
	            dialogue: Value of ``dialogue_input`` at submit time.

	        Returns:
	            A pair ``(highlighted_value, count_text)``, one item per output
	            wired to ``dialogue_input.submit``.
	        """
	        if not dialogue:
	            # Return exactly one value per wired output (two). The hint is
	            # shown as a single unlabelled span in the highlighted text.
	            return [("Please enter some dialogue first.", None)], "Total Tokens: 0"

	        result_text, token_count_val = tokenize_dialogue(dialogue)
	        return result_text, f"Total Tokens: {token_count_val}"

	    def clear_dialogue():
	        """Return reset values for the dialogue, highlighted text and count."""
	        return None, [], "Total Tokens: 0"

	    sample_btn.click(
	        fn=create_sample_dialogue,
	        outputs=[dialogue_input],
	    )

	    clear_btn.click(
	        fn=clear_dialogue,
	        outputs=[dialogue_input, highlighted_output, token_count],
	    )

	    dialogue_input.submit(
	        fn=process_dialogue,
	        inputs=[dialogue_input],
	        outputs=[highlighted_output, token_count],
	    )

	# Launch the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()