import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer


# Map input text to (token, ID) pairs using the selected model's tokenizer
def inference(
    text: str = "",
    model_id: str = "openai/clip-vit-large-patch14",
) -> tuple[list[tuple[str, str]], pd.DataFrame]:
    if text == "":
        return [], pd.DataFrame()

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Tokenize the text and extract the input IDs as a plain Python list
    text_inputs = tokenizer(text, return_tensors="pt")
    input_ids = text_inputs["input_ids"].tolist()[0]

    # Decode each ID back to its token string and pair it with its ID
    tokens = [tokenizer.decode([id_]) for id_ in input_ids]
    token_pairs = [(token, str(id_)) for token, id_ in zip(tokens, input_ids)]

    # Count the number of characters and tokens
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)],
    })

    return token_pairs, pos_count


if __name__ == "__main__":
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(label="Text"),
            gr.Dropdown(
                label="Model",
                choices=[
                    "openai/clip-vit-large-patch14",
                    "google-bert/bert-base-uncased",
                    "google/flan-t5-base",
                    "openai-community/gpt2",
                    "rinna/japanese-gpt-1b",
                ],
                value="openai/clip-vit-large-patch14",
            ),
        ],
        outputs=[
            gr.HighlightedText(label="Highlighted Text"),
            gr.Dataframe(label="Position Count"),
        ],
        examples=[
            ["When I told my computer I needed a break, it froze.",
             "openai/clip-vit-large-patch14"],
            ["Yesterday, I thought my cat was studying for her degree in philosophy "
             "because she sat on my book, but turns out she was just trying to hatch "
             "a plot to steal my dinner.",
             "openai/clip-vit-large-patch14"],
            ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
             "google/flan-t5-base"],
            # "The tallest mountain in Japan is Mt. Fuji, but what is the second tallest?"
            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?",
             "rinna/japanese-gpt-1b"],
        ],
        cache_examples=True,
        title="TokenVisor",
        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )
    iface.launch()
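
# Quick sanity check without launching the UI (a sketch; the exact token
# strings and IDs are assumptions here and depend on the chosen model's
# vocabulary — CLIP's tokenizer, for instance, wraps the input in
# <|startoftext|> / <|endoftext|> markers):
#
#   pairs, counts = inference("hello world")
#   pairs   # e.g. [('<|startoftext|>', '...'), ('hello', '...'), ...]
#   counts  # one-row DataFrame with "Char Count" and "Token Count" columns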