Taranosaurus committed on
Commit
7dae6b7
β€’
1 Parent(s): 47edf6c

Re-adjusted how the tokenizer and vocabulary are loaded

Browse files

Made it more reliable so your analysis gets loaded and processed more predictably

Files changed (1) hide show
  1. app.py +44 -26
app.py CHANGED
@@ -2,9 +2,10 @@ from transformers import AutoTokenizer
2
  import gradio as gr
3
  import random
4
 
5
- checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
6
  checkpoints = [
7
  checkpoint,
 
8
  "microsoft/phi-2",
9
  "openai/whisper-large-v3",
10
  "NousResearch/Nous-Hermes-2-Yi-34B",
@@ -27,43 +28,53 @@ def randomize_sequence():
27
 
28
  sequence = randomize_sequence
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def load_tokenizer(checkpoint):
31
  if not "tokenizer" in globals():
32
  global tokenizer
33
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
34
- try:
35
- if checkpoint == tokenizer.name_or_path:
36
- gr.Info(f"Tokenizer already loaded '{checkpoint}'")
37
- else:
38
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
39
- vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
40
- unk = next(iter(vocab))
41
- vocab.pop(unk)
42
- vocab_sorted = "\n".join(vocab)
43
- vocab_size = len(vocab)
44
- gr.Info(f"Tokenizer vocab size: {vocab_size}")
45
- return vocab_size, unk, vocab_sorted
46
- except Exception as error:
47
- gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
48
- gr.Warning(f"{error}")
49
- return None, None, None
50
 
51
  def tokenize_er(checkpoint, sequence):
52
- vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
53
  try:
 
54
  tokens = tokenizer.tokenize(sequence)
55
  ids = tokenizer.convert_tokens_to_ids(tokens)
56
  token_id_pair = []
57
  if len(tokens) == len(ids):
58
  for i in range(len(ids)):
59
  token_id_pair.append([tokens[i],ids[i]])
60
- return token_id_pair, vocab_size, unk, vocab_sorted
61
  except NameError:
62
  gr.Warning("Select Tokenizer before sequencing.")
63
- return [[None, None]], None, None, None
 
 
 
64
 
65
- def de_tokenize_er(pairs):
66
  try:
 
67
  tokens = []
68
  ids = []
69
  for row in pairs:
@@ -79,15 +90,19 @@ def de_tokenize_er(pairs):
79
  except NameError:
80
  gr.Warning("Tokenize sequence before decoding.")
81
  return None, None, None
 
 
 
82
 
83
  with gr.Blocks() as frontend:
84
  with gr.Row():
85
  with gr.Column(scale=3):
86
- gr.Markdown("# πŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... πŸ•΅οΈπŸ•³οΈ\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➑️\n\n⚠️ Loading the vocabulary can take a few seconds.")
87
  with gr.Row():
88
  gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from πŸ€— Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
89
- with gr.Group():
90
  input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
 
91
  with gr.Row():
92
  gr.Markdown("\n#### 2. Sequence & Tokenize")
93
  with gr.Row():
@@ -110,13 +125,16 @@ with gr.Blocks() as frontend:
110
  output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
111
  with gr.Column(scale=1):
112
  with gr.Group():
113
- gr.Markdown("\n#### 🎲 Tokenizer Data")
 
114
  output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
115
  output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
116
  output_vocab = gr.Code(label="Vocabulary IDs")
117
 
118
- btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count,output_unknown_token, output_vocab], queue=True)
 
119
  btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
120
- btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
 
121
 
122
  frontend.launch()
 
2
  import gradio as gr
3
  import random
4
 
5
+ checkpoint = "dslim/bert-base-NER"
6
  checkpoints = [
7
  checkpoint,
8
+ "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
9
  "microsoft/phi-2",
10
  "openai/whisper-large-v3",
11
  "NousResearch/Nous-Hermes-2-Yi-34B",
 
28
 
29
  sequence = randomize_sequence
30
 
31
def load_vocab(target_model, current_model):
    """Ensure `target_model`'s tokenizer is loaded and summarize its vocabulary.

    Parameters:
        target_model: checkpoint name requested in the UI dropdown.
        current_model: checkpoint currently tracked in the hidden state box.

    Returns:
        (checkpoint, vocab_size, unk, vocab_sorted) where `vocab_sorted` is
        the vocabulary tokens joined by newlines, ordered by token id, with
        the unknown token removed.
    """
    checkpoint = target_model
    if target_model == current_model:
        gr.Info(f"Tokenizer already loaded: {checkpoint}")
    else:
        load_tokenizer(checkpoint)
        gr.Info(f"Tokenizer loaded: {checkpoint}")
    # Sort tokens by their integer id so the displayed list lines up with ids.
    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
    # Prefer the tokenizer's declared unknown token; the original code
    # assumed the lowest-id entry is the unk token, which is not true for
    # every tokenizer — keep that heuristic only as a fallback.
    unk = getattr(tokenizer, "unk_token", None) or next(iter(vocab))
    # pop with a default: the declared unk token may be absent from vocab.
    vocab.pop(unk, None)
    vocab_sorted = "\n".join(vocab)
    vocab_size = len(vocab)
    gr.Info(f"Tokenizer vocab size: {vocab_size}")
    return checkpoint, vocab_size, unk, vocab_sorted
45
+
46
def load_tokenizer(checkpoint):
    """Load `checkpoint`'s tokenizer into the module-level `tokenizer` global.

    Raises:
        ValueError: when `checkpoint` is empty.
        gr.Error: when downloading/instantiating the tokenizer fails.
    """
    # `global` is a function-wide declaration; the original guarded it with
    # `if not "tokenizer" in globals():`, which had no effect — declare it
    # unconditionally for clarity.
    global tokenizer
    if not checkpoint:
        # Bug fix: the original `return ValueError(...)` handed back an
        # exception object instead of raising it, so an empty checkpoint
        # silently fell through with no tokenizer loaded.
        raise ValueError("Tokenizer cannot be empty!")
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
 
 
 
 
 
 
57
 
58
def tokenize_er(checkpoint, sequence):
    """Tokenize `sequence` with `checkpoint`'s tokenizer.

    Returns a list of [token, id] pairs for the dataframe display, or
    [[None, None]] when no tokenizer has been selected yet.
    """
    try:
        load_tokenizer(checkpoint)
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Pair tokens with ids only when the two lists line up; otherwise
        # fall through to an empty table, as the original did.
        if len(tokens) != len(ids):
            return []
        return [[token, token_id] for token, token_id in zip(tokens, ids)]
    except NameError:
        # `tokenizer` was never bound — the user has not picked a model.
        gr.Warning("Select Tokenizer before sequencing.")
        return [[None, None]]
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
74
 
75
+ def de_tokenize_er(checkpoint, pairs):
76
  try:
77
+ load_tokenizer(checkpoint)
78
  tokens = []
79
  ids = []
80
  for row in pairs:
 
90
  except NameError:
91
  gr.Warning("Tokenize sequence before decoding.")
92
  return None, None, None
93
+ except Exception as error:
94
+ gr.Warning("Unexpected error!")
95
+ raise gr.Error(f"{error}")
96
 
97
  with gr.Blocks() as frontend:
98
  with gr.Row():
99
  with gr.Column(scale=3):
100
+ gr.Markdown("# πŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... πŸ•΅οΈπŸ•³οΈ\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➑️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
101
  with gr.Row():
102
  gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from πŸ€— Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
103
+ with gr.Row():
104
  input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
105
+ #btn_load_vocab = gr.Button(value="Load Vocabulary")
106
  with gr.Row():
107
  gr.Markdown("\n#### 2. Sequence & Tokenize")
108
  with gr.Row():
 
125
  output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
126
  with gr.Column(scale=1):
127
  with gr.Group():
128
+ gr.Markdown("### 🎲 Tokenizer Data")
129
+ output_checkpoint = gr.Textbox(visible=False)
130
  output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
131
  output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
132
  output_vocab = gr.Code(label="Vocabulary IDs")
133
 
134
+ input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
135
+ btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
136
  btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
137
+ btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids], queue=True)
138
+ frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
139
 
140
  frontend.launch()