import json

import gradio as gr
import pandas as pd
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
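
# Note: roberta-base uses byte-level BPE, so tokens that begin a new word
# carry a leading "Ġ" (space) marker in the table and JSON output below.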

def process_text(text, include_special_tokens=False, show_attention_mask=False):
    # Full encoding (numpy tensors) so we can surface the attention mask.
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)

    # encode() adds the special tokens (<s>, </s>); deriving the token
    # strings from those same IDs keeps tokens and IDs aligned whether or
    # not the special tokens are kept.
    token_ids = tokenizer.encode(text, truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    attention_mask = encoding["attention_mask"][0]

    if not include_special_tokens:
        tokens = tokens[1:-1]
        token_ids = token_ids[1:-1]
        attention_mask = attention_mask[1:-1]

    token_info = []
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        if show_attention_mask:
            info["Attention Mask"] = int(attention_mask[i])
        token_info.append(info)

    df = pd.DataFrame(token_info)

    # max(len(text), 1) guards the ratio against empty input.
    stats = (
        f"Number of tokens: {len(tokens)}\n"
        f"Input text length: {len(text)}\n"
        f"Tokens/character ratio: {len(tokens) / max(len(text), 1):.2f}\n"
        f"Vocabulary size: {tokenizer.vocab_size}"
    )

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
    )

    return df, stats, json_output
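
# Illustrative direct call (a sketch, not executed by the app itself):
#
#     df, stats, js = process_text("Hello world", show_attention_mask=True)
#     print(df)     # one row per token: Token, ID, Attention Mask
#     print(stats)  # token count, text length, ratio, vocabulary size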

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="RoBERTa Tokenizer Playground",
    description="An interactive demonstration of the RoBERTa tokenizer.",
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)