Spaces:

Taranosaurus
/

Tokenizaminer

Running

App Files Files Community

Tokenizaminer / app.py

Taranosaurus

Changed "Unknown Token" token to "Token 0"

507d429 11 months ago

raw

history blame

6.77 kB

	from transformers import AutoTokenizer
	import gradio as gr
	import random

	checkpoint = "dslim/bert-base-NER"
	checkpoints = [
	checkpoint,
	"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	"microsoft/phi-2",
	"openai/whisper-large-v3",
	"NousResearch/Nous-Hermes-2-Yi-34B",
	"bert-base-cased"
	]

	placeholder = "Type anything in this text box and hit Tokenize!"
	sequences = [
	"The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
	"How vexingly ⏩ quick daft 🦓 zebras jump?",
	"Pack my 📦 box with five dozen 🍷 liquor jugs.",
	"The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
	"While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
	"Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
	"We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
	]

	def randomize_sequence():
	return random.choice(sequences)

	sequence = randomize_sequence

	def load_vocab(target_model, current_model):
	checkpoint = target_model
	if target_model == current_model:
	gr.Info(f"Tokenizer already loaded: {checkpoint}")
	else:
	load_tokenizer(checkpoint)
	gr.Info(f"Tokenizer loaded: {checkpoint}")
	vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
	unk = next(iter(vocab))
	vocab.pop(unk)
	vocab_sorted = "\n".join(vocab)
	vocab_size = len(vocab)
	gr.Info(f"Tokenizer vocab size: {vocab_size}")
	return checkpoint, vocab_size, unk, vocab_sorted

	def load_tokenizer(checkpoint):
	if not "tokenizer" in globals():
	global tokenizer
	if len(checkpoint) > 0:
	try:
	tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	except Exception as error:
	gr.Warning("Unexpected error!")
	raise gr.Error(f"{error}")
	else:
	return ValueError("Tokenizer cannot be empty!")

	def tokenize_er(checkpoint, sequence):
	try:
	load_tokenizer(checkpoint)
	tokens = tokenizer.tokenize(sequence)
	ids = tokenizer.convert_tokens_to_ids(tokens)
	token_id_pair = []
	if len(tokens) == len(ids):
	for i in range(len(ids)):
	token_id_pair.append([tokens[i],ids[i]])
	return token_id_pair
	except NameError:
	gr.Warning("Select Tokenizer before sequencing.")
	return [[None, None]]
	except Exception as error:
	gr.Warning("Unexpected error!")
	raise gr.Error(f"{error}")

	def de_tokenize_er(checkpoint, pairs):
	try:
	load_tokenizer(checkpoint)
	tokens = []
	ids = []
	for row in pairs:
	tokens.append(row[0])
	try:
	ids.append(int(row[1]))
	except:
	ids.append(0)
	tokens_ids= tokenizer.convert_tokens_to_ids(tokens)
	decoded_tokens = tokenizer.decode(tokens_ids)
	decoded_ids = tokenizer.decode(ids)
	return tokens_ids, decoded_tokens, decoded_ids
	except NameError:
	gr.Warning("Tokenize sequence before decoding.")
	return None, None, None
	except Exception as error:
	gr.Warning("Unexpected error!")
	raise gr.Error(f"{error}")

	with gr.Blocks() as frontend:
	with gr.Row():
	with gr.Column(scale=3):
	gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
	with gr.Row():
	gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
	with gr.Row():
	input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
	#btn_load_vocab = gr.Button(value="Load Vocabulary")
	with gr.Row():
	gr.Markdown("\n#### 2. Sequence & Tokenize")
	with gr.Row():
	input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
	with gr.Row():
	btn_tokenize = gr.Button(value="Tokenize!")
	btn_random_seq = gr.Button(value="Randomize!")
	with gr.Row():
	gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
	with gr.Row():
	token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","Vocabulary ID"], value=[[None,0]], type="array", datatype=["str", "number"], height=400, interactive=True)
	with gr.Row():
	btn_decode = gr.Button(value="Decode")
	btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
	with gr.Row():
	with gr.Column():
	output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
	output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
	with gr.Column():
	output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
	with gr.Column(scale=1):
	with gr.Group():
	gr.Markdown("### 🎲 Tokenizer Data")
	output_checkpoint = gr.Textbox(visible=False)
	output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
	output_token_zero = gr.Textbox(label="Token 0", interactive=False)
	output_vocab = gr.Code(label="Vocabulary IDs")

	input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)
	btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
	btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
	btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids], queue=True)
	frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)

	frontend.launch()