Upload folder using huggingface_hub
app.py
CHANGED
@@ -9,14 +9,18 @@ tk.load('dune-20256.model')
 et = time.time()
 print(f'Model loaded. Took {et-st} seconds.')
 
-def tokenize(text):
+def tokenize(text, checked):
     tokens = tk.encode(text)
 
     colors = ['rgba(107,64,216,.3)', 'rgba(104,222,122,.4)', 'rgba(244,172,54,.4)', 'rgba(239,65,70,.4)', 'rgba(39,181,234,.4)']
     colored_tokens = []
 
     for i, token in enumerate(tokens):
-        token = tk.vocab[token].decode('utf-8')
+        if checked:
+            ws_val = '⋅'
+        else:
+            ws_val = ' '
+        token = tk.vocab[token].decode('utf-8').replace(' ', ws_val)
         span = f'<span style="background-color: {colors[i % len(colors)]}">{token}</span>'
         colored_tokens.append(span)
 
@@ -24,7 +28,10 @@ def tokenize(text):
 
 interface = gr.Interface(
     fn=tokenize,
-    inputs=[gr.TextArea(label='Input Text', type='text')],
+    inputs=[
+        gr.TextArea(label='Input Text', type='text'),
+        gr.Checkbox(label='Show whitespace')
+    ],
     outputs=[
         gr.HTML(label='Tokenized Text'),
         gr.Textbox(label='Token IDs', lines=1, max_lines=5),
@@ -34,8 +41,9 @@ interface = gr.Interface(
     title="BPE Tokenization Visualizer",
     live=True,
     examples=[
-        'BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.',
-        'This custom BPE tokenizer model was trained on the entire text of the novel Dune by Frank Herbert and has a vocabulary size of 20,256, which corresponds to the 256 base byte tokens plus the symbols learned through 20,000 merges.'
+        ['BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.', False],
+        ['This custom BPE tokenizer model was trained on the entire text of the novel Dune by Frank Herbert and has a vocabulary size of 20,256, which corresponds to the 256 base byte tokens plus the symbols learned through 20,000 merges.', False],
+        ['The spice must flow, Paul. Without it, the Fremen will never rise, and the sands will consume us all.', False]
     ],
     show_progress='hidden',
     api_name='tokenize',
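
For reference, here is a minimal, self-contained sketch of the updated tokenize() logic. The real Space loads a trained BPE model ('dune-20256.model'); StubTokenizer below is a hypothetical stand-in for tk, so the whitespace toggle can be tried without the model file.

class StubTokenizer:
    def __init__(self):
        self.vocab = {}    # token id -> raw bytes, mirroring tk.vocab in the app
        self._ids = {}     # raw bytes -> token id (internal)

    def encode(self, text):
        # Crude word-level "tokenization": every word after the first keeps its
        # leading space, the way many byte-level BPE vocabularies do.
        ids = []
        for i, word in enumerate(text.split(' ')):
            piece = (word if i == 0 else ' ' + word).encode('utf-8')
            if piece not in self._ids:
                self._ids[piece] = len(self._ids)
                self.vocab[self._ids[piece]] = piece
            ids.append(self._ids[piece])
        return ids

tk = StubTokenizer()
colors = ['rgba(107,64,216,.3)', 'rgba(104,222,122,.4)', 'rgba(244,172,54,.4)',
          'rgba(239,65,70,.4)', 'rgba(39,181,234,.4)']

def tokenize(text, checked):
    tokens = tk.encode(text)
    colored_tokens = []
    for i, token in enumerate(tokens):
        ws_val = '⋅' if checked else ' '   # the checkbox makes whitespace visible
        token_str = tk.vocab[token].decode('utf-8').replace(' ', ws_val)
        span = f'<span style="background-color: {colors[i % len(colors)]}">{token_str}</span>'
        colored_tokens.append(span)
    return ''.join(colored_tokens), str(tokens)

html_out, ids_out = tokenize('The spice must flow', checked=True)
print(ids_out)    # -> [0, 1, 2, 3]
print(html_out)   # spans where ' spice' renders as '⋅spice', etc.

With the box checked, each space inside a decoded token is drawn as '⋅', which makes it visible where the vocabulary keeps whitespace attached to tokens (commonly as a leading space in GPT-style byte-level BPE).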
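One detail behind the examples change: once inputs is a list of two components, Gradio expects each entry in examples to be a list supplying one value per input, which is why the bare example strings became [text, False] pairs. A stripped-down sketch (hypothetical names, not the Space's full app):

import gradio as gr

def preview(text, checked):
    # Stand-in function: only demonstrates the two-input signature.
    return text.replace(' ', '⋅') if checked else text

demo = gr.Interface(
    fn=preview,
    inputs=[gr.TextArea(label='Input Text'), gr.Checkbox(label='Show whitespace')],
    outputs=gr.HTML(label='Tokenized Text'),
    examples=[['The spice must flow.', False]],  # one value per input component
)

if __name__ == '__main__':
    demo.launch()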