Taranosaurus committed on
Commit a5212f9
1 Parent(s): d222a38

Initial commit of the app.py

Files changed (1)
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
+from transformers import AutoTokenizer
+import gradio as gr
+import random
+
+checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+checkpoints = [
+    checkpoint,
+    "microsoft/phi-2",
+    "openai/whisper-large-v3",
+    "NousResearch/Nous-Hermes-2-Yi-34B",
+    "bert-base-cased"
+]
+
+placeholder = "Type anything in this text box and hit Tokenize!"
+sequences = [
+    "The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
+    "How vexingly ⏩ quick daft 🦓 zebras jump?",
+    "Pack my 📦 box with five dozen 🍷 liquor jugs.",
+    "The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
+    "While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
+    "Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
+    "We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
+]
+
+def randomize_sequence():
+    return random.choice(sequences)
+
+sequence = randomize_sequence  # passed as a callable so Gradio picks a fresh default on each page load
+
+def load_tokenizer(checkpoint):
+    if "tokenizer" not in globals():
+        global tokenizer
+        tokenizer = None
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
+        unk = next(iter(vocab))
+        vocab.pop(unk)
+        vocab_sorted = "\n".join(vocab)
+        vocab_size = len(vocab)
+        gr.Info(f"Tokenizer loaded '{checkpoint}' with vocab size: {vocab_size}")
+        #return checkpoint, vocab_size, vocab
+        return vocab_size, unk, vocab_sorted
+    except Exception as error:
+        gr.Warning("An unexpected error occurred while loading the tokenizer.")
+        gr.Warning(f"{error}")
+        return None, None, None
+
+def tokenize_er(sequence):
+    try:
+        tokens = tokenizer.tokenize(sequence)
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        token_id_pair = []
+        if len(tokens) == len(ids):
+            for i in range(len(ids)):
+                token_id_pair.append([tokens[i], ids[i]])
+        return token_id_pair
+    except NameError:
+        gr.Warning("Load a tokenizer before tokenizing a sequence.")
+        return [[None, None]]
+
+def de_tokenize_er(pairs):
+    try:
+        tokens = []
+        ids = []
+        for row in pairs:
+            tokens.append(row[0])
+            try:
+                ids.append(int(row[1]))
+            except (TypeError, ValueError):
+                ids.append(0)
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        decoded_tokens = tokenizer.decode(token_ids)
+        decoded_ids = tokenizer.decode(ids)
+        return token_ids, decoded_tokens, decoded_ids
+    except NameError:
+        gr.Warning("Tokenize a sequence before decoding.")
+        return None, None, None
+
+with gr.Blocks() as frontend:
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown("# 🐇 Tokenizaminer\n\n### The Tokenizer Examiner... 🕵️🕳️\n\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\n\n## Instructions\n\n1. Load a tokenizer\n2. Type and Tokenize a sequence\n3. Manipulate it to see what happens!")
+            with gr.Group():
+                input_checkpoint = gr.Dropdown(label="1. Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, info="Select from the list or enter any model from 🤗 Hugging Face Models; it will only download the tokenizer data! Image models won't work here.")
+                btn_load_tokenizer = gr.Button(value="Load Tokenizer")
+            with gr.Row():
+                input_sequence = gr.TextArea(label="2. Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True)
+            with gr.Row():
+                btn_tokenize = gr.Button(value="Tokenize!")
+                btn_random_seq = gr.Button(value="Randomize!")
+            with gr.Row():
+                token_id_pair = gr.DataFrame(label="3. Decode", col_count=(2, "fixed"), headers=["Token", "ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
+            with gr.Row():
+                btn_decode = gr.Button(value="Decode")
+            with gr.Row():
+                with gr.Column():
+                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
+                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
+                with gr.Column():
+                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
+        with gr.Column(scale=1):
+            with gr.Group():
+                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
+                output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
+                output_vocab = gr.Code(label="Vocabulary")
+
+    btn_load_tokenizer.click(fn=load_tokenizer, inputs=[input_checkpoint], outputs=[output_vocab_count, output_unknown_token, output_vocab])
+    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
+    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
+    btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids])
+
+frontend.launch()