Upload folder using huggingface_hub
- .gitignore +3 -1
- app.py +7 -3
- dune-20256.model +3 -0
- dune-20256.vocab +0 -0
- xsbpe/base.py +47 -1
.gitignore
CHANGED
@@ -1 +1,3 @@
-venv
+venv
+*ipynb
+__pycache__
app.py
CHANGED
@@ -1,10 +1,13 @@
+import time
 import gradio as gr
 from xsbpe.basic import BasicTokenizer
 
 tk = BasicTokenizer()
 print('Tokenizer initialized.')
-
-
+st = time.time()
+tk.load('dune-20256.model')
+et = time.time()
+print(f'Model loaded. Took {et-st} seconds.')
 
 def tokenize(text):
     tokens = tk.encode(text)
@@ -31,7 +34,8 @@ interface = gr.Interface(
     title="BPE Tokenization Visualizer",
     live=True,
     examples=[
-        'BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.'
+        'BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.',
+        'This custom BPE tokenizer model was trained on the entire text of the novel Dune by Frank Herbert and has a vocabulary size of 20,256, which corresponds to the 256 base byte tokens and the symbols learned with 20,000 merges.'
     ],
     show_progress='hidden',
     api_name='tokenize',
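With this change the Space no longer serves a bare byte-level vocabulary: it loads the pretrained Dune model at startup and reports how long the load took. A minimal sketch of exercising the loaded tokenizer outside the Gradio app, assuming only the xsbpe calls already used in this commit (BasicTokenizer, load, encode); the sample string is made up for illustration:

import time
from xsbpe.basic import BasicTokenizer

# Assumes dune-20256.model sits in the working directory,
# as it does in the Space after this commit.
tk = BasicTokenizer()
start = time.time()
tk.load('dune-20256.model')
print(f'Model loaded. Took {time.time() - start} seconds.')

sample = 'The spice must flow.'     # illustrative input
tokens = tk.encode(sample)          # list of integer token ids
print(f'{len(sample)} characters -> {len(tokens)} tokens: {tokens}')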
dune-20256.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aceeff36b589168f40e87d22befcc69879312882c3f8c16f4df412bd9c95cef
+size 174006
dune-20256.vocab
ADDED
The diff for this file is too large to render. See raw diff.
xsbpe/base.py
CHANGED
@@ -60,4 +60,50 @@ class Tokenizer:
             vocab[idx] = vocab[p0] + vocab[p1]
         for special, idx in self.special_tokens.items():
             vocab[idx] = special.encode('utf-8')
-        return vocab
+        return vocab
+
+    def save(self, file_prefix):
+        # Similar to sentencepiece model saving
+        model_file = file_prefix + '.model'
+        with open(model_file, 'w') as f:
+            f.write('xsbpe v1\n')
+            f.write(f'{self.pattern}\n')
+            f.write(f'{len(self.special_tokens)}\n')
+            for special, idx in self.special_tokens.items():
+                f.write(f'{special} {idx}\n')
+            for idx1, idx2 in self.merges:
+                f.write(f'{idx1} {idx2}\n')
+        # vocab file meant for human inspection only
+        vocab_file = file_prefix + '.vocab'
+        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            for idx, token in self.vocab.items():
+                s = render_token(token)
+                if idx in inverted_merges:
+                    idx0, idx1 = inverted_merges[idx]
+                    s0 = render_token(self.vocab[idx0])
+                    s1 = render_token(self.vocab[idx1])
+                    f.write(f'[{s0}][{s1}] -> [{s}] {idx}\n')
+                else:
+                    f.write(f'[{s}] {idx}\n')
+
+    def load(self, model_file):
+        assert model_file.endswith('.model')
+        merges = {}
+        special_tokens = {}
+        idx = 256
+        with open(model_file, 'r', encoding='utf-8') as f:
+            version = f.readline().strip()
+            assert version == 'xsbpe v1'
+            self.pattern = f.readline().strip()
+            num_special = int(f.readline().strip())
+            for _ in range(num_special):
+                special, special_idx = f.readline().strip().split()
+                special_tokens[special] = int(special_idx)
+            for line in f:
+                idx1, idx2 = map(int, line.split())
+                merges[(idx1, idx2)] = idx
+                idx += 1
+        self.merges = merges
+        self.special_tokens = special_tokens
+        self.vocab = self._build_vocab()
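The .model file written by save() and parsed by load() is a small plain-text format: a version line ('xsbpe v1'), the split pattern, the special-token count followed by one 'token id' pair per line, then one merged id pair per line, with new token ids assigned implicitly in file order starting at 256. A minimal round-trip sketch of these two methods; note the train() call (with a target vocab_size) and the corpus path are assumptions for illustration, not part of this commit:

from xsbpe.basic import BasicTokenizer

# Train a small tokenizer, save it, reload it into a fresh instance,
# and check that both encode a sample identically.
text = open('dune.txt', encoding='utf-8').read()  # assumed corpus file
tk = BasicTokenizer()
tk.train(text, vocab_size=256 + 100)              # assumed train() signature; 100 merges
tk.save('dune-tiny')                              # writes dune-tiny.model and dune-tiny.vocab

tk2 = BasicTokenizer()
tk2.load('dune-tiny.model')

sample = 'Fear is the mind-killer.'
assert tk.encode(sample) == tk2.encode(sample)
print(tk2.encode(sample))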