Taranosaurus committed on
Commit
7dae6b7
β€’
1 Parent(s): 47edf6c

Re-adjusted how the tokenizer and vocabulary are loaded

Browse files

Made it more reliable so your analysis gets loaded and processed more predictably

Files changed (1) hide show
  1. app.py +44 -26
app.py CHANGED
@@ -2,9 +2,10 @@ from transformers import AutoTokenizer
2
  import gradio as gr
3
  import random
4
 
5
- checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
6
  checkpoints = [
7
  checkpoint,
 
8
  "microsoft/phi-2",
9
  "openai/whisper-large-v3",
10
  "NousResearch/Nous-Hermes-2-Yi-34B",
@@ -27,43 +28,53 @@ def randomize_sequence():
27
 
28
  sequence = randomize_sequence
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def load_tokenizer(checkpoint):
31
  if not "tokenizer" in globals():
32
  global tokenizer
33
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
34
- try:
35
- if checkpoint == tokenizer.name_or_path:
36
- gr.Info(f"Tokenizer already loaded '{checkpoint}'")
37
- else:
38
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
39
- vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
40
- unk = next(iter(vocab))
41
- vocab.pop(unk)
42
- vocab_sorted = "\n".join(vocab)
43
- vocab_size = len(vocab)
44
- gr.Info(f"Tokenizer vocab size: {vocab_size}")
45
- return vocab_size, unk, vocab_sorted
46
- except Exception as error:
47
- gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
48
- gr.Warning(f"{error}")
49
- return None, None, None
50
 
51
  def tokenize_er(checkpoint, sequence):
52
- vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
53
  try:
 
54
  tokens = tokenizer.tokenize(sequence)
55
  ids = tokenizer.convert_tokens_to_ids(tokens)
56
  token_id_pair = []
57
  if len(tokens) == len(ids):
58
  for i in range(len(ids)):
59
  token_id_pair.append([tokens[i],ids[i]])
60
- return token_id_pair, vocab_size, unk, vocab_sorted
61
  except NameError:
62
  gr.Warning("Select Tokenizer before sequencing.")
63
- return [[None, None]], None, None, None
 
 
 
64
 
65
- def de_tokenize_er(pairs):
66
  try:
 
67
  tokens = []
68
  ids = []
69
  for row in pairs:
@@ -79,15 +90,19 @@ def de_tokenize_er(pairs):
79
  except NameError:
80
  gr.Warning("Tokenize sequence before decoding.")
81
  return None, None, None
 
 
 
82
 
83
  with gr.Blocks() as frontend:
84
  with gr.Row():
85
  with gr.Column(scale=3):
86
- gr.Markdown("# πŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... πŸ•΅οΈπŸ•³οΈ\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➑️\n\n⚠️ Loading the vocabulary can take a few seconds.")
87
  with gr.Row():
88
  gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from πŸ€— Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
89
- with gr.Group():
90
  input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
 
91
  with gr.Row():
92
  gr.Markdown("\n#### 2. Sequence & Tokenize")
93
  with gr.Row():
@@ -110,13 +125,16 @@ with gr.Blocks() as frontend:
110
  output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
111
  with gr.Column(scale=1):
112
  with gr.Group():
113
- gr.Markdown("\n#### 🎲 Tokenizer Data")
 
114
  output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
115
  output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
116
  output_vocab = gr.Code(label="Vocabulary IDs")
117
 
118
- btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count,output_unknown_token, output_vocab], queue=True)
 
119
  btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
120
- btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
 
121
 
122
  frontend.launch()
 
2
  import gradio as gr
3
  import random
4
 
5
+ checkpoint = "dslim/bert-base-NER"
6
  checkpoints = [
7
  checkpoint,
8
+ "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
9
  "microsoft/phi-2",
10
  "openai/whisper-large-v3",
11
  "NousResearch/Nous-Hermes-2-Yi-34B",
 
28
 
29
  sequence = randomize_sequence
30
 
31
def load_vocab(target_model, current_model):
    """Ensure `target_model`'s tokenizer is loaded and summarize its vocabulary.

    Parameters:
        target_model: checkpoint name requested in the UI dropdown.
        current_model: checkpoint currently tracked in the hidden state box.

    Returns:
        (checkpoint, vocab_size, unk, vocab_sorted) where `vocab_sorted` is
        the vocabulary tokens joined by newlines, ordered by token id, with
        the unknown token removed.
    """
    checkpoint = target_model
    if target_model == current_model:
        gr.Info(f"Tokenizer already loaded: {checkpoint}")
    else:
        load_tokenizer(checkpoint)
        gr.Info(f"Tokenizer loaded: {checkpoint}")
    # Sort tokens by their integer id so the displayed list lines up with ids.
    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
    # Prefer the tokenizer's declared unknown token; the original code
    # assumed the lowest-id entry is the unk token, which is not true for
    # every tokenizer — keep that heuristic only as a fallback.
    unk = getattr(tokenizer, "unk_token", None) or next(iter(vocab))
    # pop with a default: the declared unk token may be absent from vocab.
    vocab.pop(unk, None)
    vocab_sorted = "\n".join(vocab)
    vocab_size = len(vocab)
    gr.Info(f"Tokenizer vocab size: {vocab_size}")
    return checkpoint, vocab_size, unk, vocab_sorted
45
+
46
def load_tokenizer(checkpoint):
    """Load `checkpoint`'s tokenizer into the module-level `tokenizer` global.

    Raises:
        ValueError: when `checkpoint` is empty.
        gr.Error: when downloading/instantiating the tokenizer fails.
    """
    # `global` is a function-wide declaration; the original guarded it with
    # `if not "tokenizer" in globals():`, which had no effect — declare it
    # unconditionally for clarity.
    global tokenizer
    if not checkpoint:
        # Bug fix: the original `return ValueError(...)` handed back an
        # exception object instead of raising it, so an empty checkpoint
        # silently fell through with no tokenizer loaded.
        raise ValueError("Tokenizer cannot be empty!")
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
 
 
 
 
 
 
57
 
58
def tokenize_er(checkpoint, sequence):
    """Tokenize `sequence` with `checkpoint`'s tokenizer.

    Returns a list of [token, id] pairs for the dataframe display, or
    [[None, None]] when no tokenizer has been selected yet.
    """
    try:
        load_tokenizer(checkpoint)
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Pair tokens with ids only when the two lists line up; otherwise
        # fall through to an empty table, as the original did.
        if len(tokens) != len(ids):
            return []
        return [[token, token_id] for token, token_id in zip(tokens, ids)]
    except NameError:
        # `tokenizer` was never bound — the user has not picked a model.
        gr.Warning("Select Tokenizer before sequencing.")
        return [[None, None]]
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
74
 
75
+ def de_tokenize_er(checkpoint, pairs):
76
  try:
77
+ load_tokenizer(checkpoint)
78
  tokens = []
79
  ids = []
80
  for row in pairs:
 
90
  except NameError:
91
  gr.Warning("Tokenize sequence before decoding.")
92
  return None, None, None
93
+ except Exception as error:
94
+ gr.Warning("Unexpected error!")
95
+ raise gr.Error(f"{error}")
96
 
97
  with gr.Blocks() as frontend:
98
  with gr.Row():
99
  with gr.Column(scale=3):
100
+ gr.Markdown("# πŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... πŸ•΅οΈπŸ•³οΈ\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➑️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
101
  with gr.Row():
102
  gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from πŸ€— Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
103
+ with gr.Row():
104
  input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
105
+ #btn_load_vocab = gr.Button(value="Load Vocabulary")
106
  with gr.Row():
107
  gr.Markdown("\n#### 2. Sequence & Tokenize")
108
  with gr.Row():
 
125
  output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
126
  with gr.Column(scale=1):
127
  with gr.Group():
128
+ gr.Markdown("### 🎲 Tokenizer Data")
129
+ output_checkpoint = gr.Textbox(visible=False)
130
  output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
131
  output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
132
  output_vocab = gr.Code(label="Vocabulary IDs")
133
 
134
+ input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
135
+ btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
136
  btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
137
+ btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids], queue=True)
138
+ frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
139
 
140
  frontend.launch()