# This application creates a Gradio interface for comparing the speed of
# Hugging Face tokenizers and OpenAI's tiktoken tokenizer.

import time

import gradio as gr
import tiktoken
from transformers import AutoTokenizer

# Hugging Face tokenizers for these models are pre-loaded at startup.
EXAMPLE_MODELS: list = ["gpt2"]
TOKENIZERS: dict = {m: AutoTokenizer.from_pretrained(m) for m in EXAMPLE_MODELS}


def get_tokenizer(model_name):
    """Return the pre-loaded HF tokenizer, falling back to tiktoken's gpt2 encoding."""
    if model_name in EXAMPLE_MODELS:
        return TOKENIZERS[model_name]
    return tiktoken.get_encoding("gpt2")


def times_faster(time_1, time_2):
    """Return time_2 expressed as a percentage of time_1."""
    return (time_2 / time_1) * 100


def run_hf_tokenizer(model_name, text):
    """Encode `text` with the Hugging Face tokenizer and time the call."""
    tokenizer = get_tokenizer(model_name)
    start = time.time()
    encoded = tokenizer.encode(text)
    end = time.time()
    elapsed_time = end - start
    print(f"Encoded: {encoded}")
    print(f"Time taken by HF tokenizer: {elapsed_time}")
    return elapsed_time, encoded


def run_openai_tokenizer(text):
    """Encode `text` with tiktoken's gpt2 encoding and time the call."""
    tokenizer = tiktoken.get_encoding("gpt2")
    start = time.time()
    encoded = tokenizer.encode(text)
    end = time.time()
    elapsed_time = end - start
    print(f"Encoded: {encoded}")
    print(f"Time taken by OpenAI tokenizer: {elapsed_time}")
    return elapsed_time, encoded


def run_tokenizers(model_name, text):
    """Run both tokenizers on `text` and return a comparison summary."""
    hf_time, hf_encoded = run_hf_tokenizer(model_name, text)
    openai_time, openai_encoded = run_openai_tokenizer(text)
    return {
        "HF Tokenizer": {
            "Time Taken": hf_time,
            "Num Tokens": len(hf_encoded),
        },
        "OpenAI Tokenizer": {
            "Time Taken": openai_time,
            "Num Tokens": len(openai_encoded),
        },
        # OpenAI time as a percentage of HF time (see times_faster).
        "Times Faster": str(times_faster(hf_time, openai_time)) + "%",
    }


iface = gr.Interface(
    fn=run_tokenizers,
    inputs=[
        gr.components.Dropdown(EXAMPLE_MODELS, label="Model Name"),
        gr.components.Textbox(lines=10, label="Text"),
    ],
    outputs="json",
    title="OpenAI Tokenizer vs HF Tokenizers Speed Test",
    examples=[
        ["gpt2", "This is a test of the OpenAI tokenizer vs the HF tokenizer"],
        [
            "gpt2",
            """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
""",
        ],
    ],
)

iface.launch()