Spaces:
Sleeping
Sleeping
File size: 1,589 Bytes
0097326 9e2a2bb 0097326 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import html

import gradio as gr
from xsbpe.basic import BasicTokenizer
# Build and train the tokenizer once at import time; tokenize() below reads
# this module-level instance on every call.
tk = BasicTokenizer()
print('Tokenizer initialized.')
# Read the training corpus through a context manager so the file handle is
# closed as soon as the text is in memory, and pin the encoding instead of
# depending on the platform default.
with open('dune.txt', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()
# NOTE(review): 256 + 10 is presumably the target vocabulary size (256 raw
# byte values plus 10 learned merges) -- confirm against xsbpe's train().
tk.train(corpus, 256 + 10, verbose=False)
print('Training complete.')
def tokenize(text):
    """Encode *text* with the module-level tokenizer and render it as HTML.

    Every token is wrapped in a ``<span>`` whose background colour cycles
    through a fixed five-colour palette, so adjacent tokens are visually
    distinguishable.

    Args:
        text: The string to tokenize.

    Returns:
        A 4-tuple ``(markup, tokens, token_count, char_count)``: the
        highlighted HTML paragraph, the token ids returned by ``tk.encode``,
        the number of tokens, and ``len(text)``.
    """
    tokens = tk.encode(text)
    colors = ['rgba(107,64,216,.3)', 'rgba(104,222,122,.4)', 'rgba(244,172,54,.4)', 'rgba(239,65,70,.4)', 'rgba(39,181,234,.4)']
    colored_tokens = []
    for i, token_id in enumerate(tokens):
        # A vocabulary entry is a byte string and may contain only part of a
        # multi-byte UTF-8 character; decode leniently so such tokens show
        # up as U+FFFD instead of raising UnicodeDecodeError.
        piece = tk.vocab[token_id].decode('utf-8', errors='replace')
        # Escape markup characters so the input text is displayed literally
        # rather than being interpreted as HTML.
        piece = html.escape(piece)
        # Non-breaking spaces keep runs of spaces from being collapsed by
        # the browser, so whitespace inside tokens stays visible.
        piece = piece.replace(' ', '\u00a0')
        span = f'<span style="background-color: {colors[i % len(colors)]}">{piece}</span>'
        colored_tokens.append(span)
    markup = '<p style="margin-left: 2px; margin-right: 2px; word-wrap: break-word">' + ''.join(colored_tokens) + '</p>'
    return markup, tokens, len(tokens), len(text)
# Gradio front end: one text area in, four components out. The output
# components are listed in the same order as the values tokenize() returns
# (HTML markup, token ids, token count, character count).
# NOTE: because .launch() is chained onto the constructor, `interface` is
# bound to whatever launch() returns, not to the gr.Interface object itself.
interface = gr.Interface(
    fn=tokenize,
    inputs=[gr.TextArea(label='Input Text', type='text')],
    outputs=[
        gr.HTML(label='Tokenized Text'),
        gr.Textbox(label='Token IDs', lines=1, max_lines=5),
        gr.Textbox(label='Tokens', max_lines=1),
        gr.Textbox(label='Characters', max_lines=1),
    ],
    title="BPE Tokenization Visualizer",
    # live=True asks Gradio to re-run tokenize() whenever the input changes,
    # so no explicit submit is needed.
    live=True,
    examples=[
        'BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.'
    ],
    show_progress='hidden',
    api_name='tokenize',
    allow_flagging='never',
).launch()