mrm8488 committed on
Commit
f434980
β€’
1 Parent(s): 7496b6b

First commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +78 -0
  4. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .devcontainer/*
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Tokenizer Comparator
3
  emoji: πŸƒ
4
  colorFrom: red
5
  colorTo: blue
 
1
  ---
2
+ title: Tokenizers Comparator
3
  emoji: πŸƒ
4
  colorFrom: red
5
  colorTo: blue
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This application creates a Gradio interface for testing the speed of different tokenizers
2
+ import gradio as gr
3
+ import tiktoken
4
+ import time
5
+ from transformers import AutoTokenizer
6
+
7
# Models whose HF tokenizers are pre-loaded once at startup so requests
# for them don't pay the download/instantiation cost.
EXAMPLE_MODELS: list = ["gpt2"]
# Map: model name -> ready-to-use HF tokenizer instance.
TOKENIZERS: dict = {m: AutoTokenizer.from_pretrained(m) for m in EXAMPLE_MODELS}
9
+
10
def get_tokenizer(model_name):
    """Return the tokenizer to benchmark for *model_name*.

    Known example models get their pre-loaded HF tokenizer; any other
    name falls back to tiktoken's "gpt2" encoding.
    """
    if model_name not in EXAMPLE_MODELS:
        # Unknown model: use the OpenAI tiktoken gpt2 encoding instead.
        return tiktoken.get_encoding("gpt2")
    return TOKENIZERS[model_name]
15
+
16
+
17
def times_faster(time_1, time_2):
    """Return *time_2* expressed as a percentage of *time_1*.

    E.g. times_faster(2.0, 1.0) == 50.0 (time_2 took half as long).
    The caller appends a "%" when displaying this value.

    Fix: guard against time_1 == 0 — time.time() has coarse resolution
    on some platforms, so a very fast encode can report 0 elapsed
    seconds, which previously raised ZeroDivisionError. Returns
    float("inf") in that case.
    """
    if time_1 == 0:
        return float("inf")
    return (time_2 / time_1) * 100
19
+
20
+
21
def run_hf_tokenizer(model_name, text):
    """Encode *text* with the tokenizer chosen for *model_name* and time it.

    Returns a (elapsed_seconds, token_ids) pair; also prints both for
    the server logs.
    """
    tok = get_tokenizer(model_name)
    t0 = time.time()
    token_ids = tok.encode(text)
    elapsed = time.time() - t0
    print(f"Encoded: {token_ids}")
    print(f"Time taken by HF tokenizer: {elapsed}")
    return elapsed, token_ids
30
+
31
+
32
def run_openai_tokenizer(text):
    """Encode *text* with OpenAI's tiktoken "gpt2" encoding and time it.

    Returns a (elapsed_seconds, token_ids) pair; also prints both for
    the server logs.
    """
    enc = tiktoken.get_encoding("gpt2")
    t0 = time.time()
    token_ids = enc.encode(text)
    elapsed = time.time() - t0
    print(f"Encoded: {token_ids}")
    print(f"Time taken by OpenAI tokenizer: {elapsed}")
    return elapsed, token_ids
41
+
42
+
43
def run_tokenizers(model_name, text):
    """Benchmark both tokenizers on *text* and return a comparison summary.

    Returns a dict (rendered as JSON by the UI) with per-tokenizer
    elapsed time and token count, plus the OpenAI time as a percentage
    of the HF time.

    Fix: the two result entries used inconsistent key casing
    ("Num tokens" vs "Num Tokens"); both are now "Num Tokens".
    """
    hf_elapsed, hf_tokens = run_hf_tokenizer(model_name, text)
    oa_elapsed, oa_tokens = run_openai_tokenizer(text)
    return {
        "HF Tokenizer": {
            "Time Taken": hf_elapsed,
            "Num Tokens": len(hf_tokens),
        },
        "OpenAI Tokenizer": {
            "Time Taken": oa_elapsed,
            "Num Tokens": len(oa_tokens),
        },
        # Percentage ratio (see times_faster); displayed with a "%" suffix.
        "Times Faster": str(times_faster(hf_elapsed, oa_elapsed)) + "%",
    }
57
+
58
# Build the Gradio UI: a model picker plus a free-text box; the comparison
# summary from run_tokenizers is rendered as JSON.
model_dropdown = gr.components.Dropdown(EXAMPLE_MODELS, label="Model Name")
text_box = gr.components.Textbox(lines=10, label="Text")

example_inputs = [
    ["gpt2", "This is a test of the OpenAI tokenizer vs the HF tokenizer"],
    ["gpt2", """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

πŸ€— Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

πŸ“ Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
πŸ–ΌοΈ Computer Vision: image classification, object detection, and segmentation.
πŸ—£οΈ Audio: automatic speech recognition and audio classification.
πŸ™ Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
"""],
]

iface = gr.Interface(
    fn=run_tokenizers,
    inputs=[model_dropdown, text_box],
    outputs="json",
    title="OpenAI Tokenizer vs HF Tokenizers Speed Test",
    examples=example_inputs,
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ tokenizers
4
+ tiktoken