Taranosaurus committed on
Commit
507d429
•
1 Parent(s): 7dae6b7

Changed "Unknown Token" token to "Token 0"


Some models have the unknown (UNK) token at a non-zero index in the vocab
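For context, a minimal illustration of why the UNK token is not a reliable stand-in for "token 0" (hypothetical snippet, not part of app.py; assumes transformers is installed):

# bert-base-uncased puts [PAD] at id 0 and [UNK] at id 100,
# so the unknown token and token 0 are different things.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tok.unk_token, tok.convert_tokens_to_ids(tok.unk_token))  # [UNK] 100
print(tok.convert_ids_to_tokens(0))                             # [PAD]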

Files changed (1)
  1. app.py +3 -3
app.py CHANGED
@@ -128,13 +128,13 @@ with gr.Blocks() as frontend:
     gr.Markdown("### 🎲 Tokenizer Data")
     output_checkpoint = gr.Textbox(visible=False)
     output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
-    output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
+    output_token_zero = gr.Textbox(label="Token 0", interactive=False)
     output_vocab = gr.Code(label="Vocabulary IDs")

-    input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
+    input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)
     btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
     btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
     btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids], queue=True)
-    frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_unknown_token, output_vocab], queue=True)
+    frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)

     frontend.launch()
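The diff only touches the UI wiring, so as a rough sketch of what the updated load_vocab callback might return to fill the new Token 0 field (the real implementation lives elsewhere in app.py and is not shown here; loading checkpoints with AutoTokenizer is an assumption):

# Hypothetical sketch -- not the actual load_vocab from app.py.
import json
from transformers import AutoTokenizer

def load_vocab(checkpoint, current_checkpoint):
    # Assumption: the app resolves checkpoints through transformers' AutoTokenizer.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    vocab = tokenizer.get_vocab()
    # Report whatever string sits at id 0 rather than tokenizer.unk_token,
    # since UNK is not guaranteed to be at index 0.
    token_zero = tokenizer.convert_ids_to_tokens(0)
    # Order matches the Gradio outputs: checkpoint, vocab size, Token 0, vocabulary IDs.
    return checkpoint, len(vocab), token_zero, json.dumps(vocab, indent=2)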