Taranosaurus committed on
Commit ce49ae8 • 1 Parent(s): 2488d19

Refactored how the model gets loaded


Removed the model load button and added queueing to the Tokenize button for a better user experience
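In effect, the commit collapses the old two-step flow (load the tokenizer, then tokenize) into a single queued click handler that loads lazily and caches the tokenizer. A minimal sketch of that pattern, assuming Gradio's Blocks API and transformers' AutoTokenizer; the checkpoint names and the `demo`/`tok` identifiers are illustrative, not from the commit:

import gradio as gr
from transformers import AutoTokenizer

def load_tokenizer(checkpoint):
    # Reuse the cached tokenizer unless a different checkpoint is requested.
    global tokenizer
    if "tokenizer" not in globals() or checkpoint != tokenizer.name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return tokenizer

def tokenize_er(checkpoint, sequence):
    # Loading happens lazily on the first Tokenize click, not via a separate button.
    tok = load_tokenizer(checkpoint)
    tokens = tok.tokenize(sequence)
    ids = tok.convert_tokens_to_ids(tokens)
    return [[t, i] for t, i in zip(tokens, ids)]

with gr.Blocks() as demo:
    checkpoint = gr.Dropdown(choices=["gpt2", "bert-base-uncased"], value="gpt2", allow_custom_value=True)  # illustrative checkpoints
    sequence = gr.TextArea(lines=3)
    pairs = gr.DataFrame(headers=["Token", "ID"])
    btn_tokenize = gr.Button("Tokenize!")
    # queue=True routes the click through Gradio's queue, so a slow first-time
    # tokenizer download doesn't block or time out the request.
    btn_tokenize.click(fn=tokenize_er, inputs=[checkpoint, sequence], outputs=[pairs], queue=True)

demo.launch()

Because tokenize_er now receives the checkpoint directly, the dropdown alone determines which tokenizer is active, which is why the commit can drop btn_load_tokenizer and its click binding entirely.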

Files changed (1)
  1. app.py +20 -18
app.py CHANGED
@@ -30,23 +30,26 @@ sequence = randomize_sequence
 def load_tokenizer(checkpoint):
     if not "tokenizer" in globals():
         global tokenizer
-        tokenizer = None
-    try:
         tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    try:
+        if checkpoint == tokenizer.name_or_path:
+            gr.Info(f"Tokenizer already loaded '{checkpoint}'")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
         vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
         unk = next(iter(vocab))
         vocab.pop(unk)
         vocab_sorted = "\n".join(vocab)
         vocab_size = len(vocab)
-        gr.Info(f"Tokenizer loaded '{checkpoint}' with vocab size: {vocab_size}")
-        #return checkpoint, vocab_size, vocab
+        gr.Info(f"Tokenizer vocab size: {vocab_size}")
         return vocab_size, unk, vocab_sorted
     except Exception as error:
         gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
         gr.Warning(f"{error}")
         return None, None, None
 
-def tokenize_er(sequence):
+def tokenize_er(checkpoint, sequence):
+    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
     try:
         tokens = tokenizer.tokenize(sequence)
         ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -54,10 +57,10 @@ def tokenize_er(sequence):
         if len(tokens) == len(ids):
             for i in range(len(ids)):
                 token_id_pair.append([tokens[i],ids[i]])
-        return token_id_pair
+        return token_id_pair, vocab_size, unk, vocab_sorted
     except NameError:
-        gr.Warning("Load Tokenizer before sequencing.")
-        return [[None, None]]
+        gr.Warning("Select Tokenizer before sequencing.")
+        return [[None, None]], None, None, None
 
 def de_tokenize_er(pairs):
     try:
@@ -80,25 +83,25 @@ def de_tokenize_er(pairs):
 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("# 🍇 Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
+            gr.Markdown("# 🍇 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the vocabulary can take a few seconds.")
             with gr.Row():
-                gr.Markdown("\n#### 1. Load Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
+                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
                 with gr.Group():
-                    input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
-                    btn_load_tokenizer = gr.Button(value="Load Tokenizer")
+                    input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
             with gr.Row():
                 gr.Markdown("\n#### 2. Sequence & Tokenize")
             with gr.Row():
-                input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
+                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
             with gr.Row():
                 btn_tokenize = gr.Button(value="Tokenize!")
                 btn_random_seq = gr.Button(value="Randomize!")
             with gr.Row():
                 gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
             with gr.Row():
-                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
+                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","Vocabulary ID"], value=[[None,0]], type="array", datatype=["str", "number"], height=400, interactive=True)
             with gr.Row():
                 btn_decode = gr.Button(value="Decode")
+                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
             with gr.Row():
                 with gr.Column():
                     output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
@@ -107,13 +110,12 @@ with gr.Blocks() as frontend:
                     output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
-                gr.Markdown("\n#### Tokenizer Data")
+                gr.Markdown("\n#### 🎲 Tokenizer Data")
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
-                output_vocab = gr.Code(label="Vocabulary")
+                output_vocab = gr.Code(label="Vocabulary IDs")
 
-    btn_load_tokenizer.click(fn=load_tokenizer, inputs=[input_checkpoint], outputs=[output_vocab_count,output_unknown_token, output_vocab])
-    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
+    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count,output_unknown_token, output_vocab], queue=True)
     btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
     btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
 