# tokenvisor/app.py

import os
import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer
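
# Disable the tokenizers library's internal parallelism up front; this avoids
# the "current process just got forked" warning once Gradio spawns workers.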
os.environ['TOKENIZERS_PARALLELISM'] = "false"


# Map input text to (token, ID) pairs with the selected model's tokenizer.
def inference(
        text: str = "",
        model_id: str = "openai/clip-vit-large-patch14",
        progress=gr.Progress(),
) -> tuple[list[tuple[str, str]], list[list[str]], pd.DataFrame]:
    if text == "":
        return [], [], pd.DataFrame()

    progress(0, desc='Loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Tokenize the text; return_tensors='pt' yields a batch of PyTorch tensors
    progress(0.5, desc='Tokenizing text...')
    text_inputs = tokenizer(text, return_tensors='pt')
    input_ids = text_inputs['input_ids'].tolist()[0]  # First (only) sequence in the batch

    # Decode each ID separately so every token gets its own display string
    tokens = [tokenizer.decode([id_]) for id_ in input_ids]
    token_pairs = [(token, str(id_)) for token, id_ in zip(tokens, input_ids)]

    # Count the number of characters and tokens
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)],
    })

    # Collect the tokenizer's special tokens (bos/eos/pad/unk, ...)
    special_tokens = []
    for k, v in tokenizer.special_tokens_map.items():
        # Skip this key: its value is a list of tokens, not a single token
        if k == 'additional_special_tokens':
            continue
        special_tokens.append([str(k), str(v)])
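
    # For example, openai-community/gpt2 maps bos_token, eos_token, and
    # unk_token all to '<|endoftext|>', giving three name/token pairs here.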

    return token_pairs, special_tokens, pos_count
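
# A minimal sketch of calling inference() outside the UI (illustrative only:
# the tokenizer download needs network access, and the exact tokens and IDs
# depend on the model; the lambda stands in for gr.Progress when no Gradio
# event is running):
#
#   pairs, specials, counts = inference(
#       "Hello world", "openai-community/gpt2", progress=lambda *a, **k: None
#   )
#   for token, id_ in pairs:
#       print(f"{id_:>8} -> {token!r}")
#   print(counts)  # one-row DataFrame with Char Count / Token Count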

if __name__ == '__main__':
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(label="Text"),
            gr.Dropdown(
                label="Model",
                choices=[
                    "openai/clip-vit-large-patch14",
                    "google/gemma-7b",
                    "google-bert/bert-base-uncased",
                    "google/flan-t5-base",
                    "openai-community/gpt2",
                    "rinna/japanese-gpt-1b",
                    "cyberagent/open-calm-7b",
                ],
                value="openai/clip-vit-large-patch14",
            ),
        ],
        outputs=[
            gr.HighlightedText(label="Highlighted Text"),
            gr.HighlightedText(label="Special Tokens", combine_adjacent=True, adjacent_separator=' / '),
            gr.Dataframe(label="Position Count"),
        ],
        examples=[
            ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
            ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
             "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
            ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
             "google/flan-t5-base"],
            # 'いただきマサチューセッツ' puns on いただきます ("itadakimasu", said before eating)
            ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
             "google/gemma-7b"],
            # "The tallest mountain in Japan is Mt. Fuji, but what is the second tallest?"
            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
        ],
        cache_examples=True,
        title="TokenVisor 👀",
        description="Visualize how tokenizers from Hugging Face's Transformers library split text into tokens.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )
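
    # queue() is needed for gr.Progress updates to stream to the browser.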
    iface.queue().launch()