fxtentacle committed · Commit b9e6d62 · Parent: b93b4b2
Update README.md

README.md (changed)
This repo contains the fully trained ByT5 that was used to estimate per-character entropies. Using it, you can also recreate the illustration from the paper.

## Generate TEVR Tokenizer from a Text Corpus

(copy of `Generate TEVR Tokenizer.ipynb`)

```python
# TODO: load a large text corpus here, e.g. OSCAR
all_sentences_de = ["Über vier Jahrzehnte gehörte er zu den führenden Bildhauern Niederbayerns", "die katze ist niedlich"] * 1000
```

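The two hard-coded sentences are only a stand-in for the TODO above. As a hedged sketch (the dataset name, config, and field below are assumptions, not part of this repo), the German portion of OSCAR could be streamed with the `datasets` library:

```python
# Hypothetical replacement for the TODO above: stream German sentences from OSCAR.
from datasets import load_dataset

oscar_de = load_dataset("oscar", "unshuffled_deduplicated_de", split="train", streaming=True)
all_sentences_de = [row["text"] for _, row in zip(range(100_000), oscar_de)]
```
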

```python
from huggingface_hub import snapshot_download
data_folder = snapshot_download("fxtentacle/tevr-token-entropy-predictor-de")
```

```python
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained(data_folder)
model.to('cuda')
model.eval()
None  # suppress notebook output
```

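The cell above assumes a CUDA GPU. As a minimal, hedged variant for machines without one (slower, but otherwise equivalent), the device can be picked dynamically; note that the `.to('cuda')` calls inside `text_to_cross_entropy` below would then need the same `device`:

```python
import torch

# Hypothetical CPU fallback - use whichever device is actually available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()
```
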

```python
import torch

def text_to_cross_entropy(text):
    # decoder input: a leading 0 as start token, followed by the UTF-8 bytes of the text
    ttext = torch.tensor([[0]+list(text.encode('UTF-8'))], dtype=torch.int64).to('cuda')
    # minimal encoder input: a single token with attention mask 1
    tone = torch.tensor([[1]], dtype=torch.int32).to('cuda')
    logits = model.forward(input_ids=tone, attention_mask=tone, decoder_input_ids=ttext, return_dict=False)[0].detach()
    # per-character cross-entropy between the model's prediction and the actual next byte
    cross_entropy = torch.nn.functional.cross_entropy(input=logits[0][:-1], target=ttext[0][1:], reduction='none').detach().cpu().numpy()
    return cross_entropy
```

```python
text = all_sentences_de[0]
cross_entropy = text_to_cross_entropy(text)
print(text)
for i in range(len(text)):
    print(text[i], cross_entropy[i])
```

    Über vier Jahrzehnte gehörte er zu den führenden Bildhauern Niederbayerns
    Ü 7.254014
    b 0.17521738
    e 0.00046933602
    r 0.01929327
      0.0003675739
    v 0.20927554
    i 6.13207
    e 0.3896482
    r 0.009583538
      2.07364
    J 0.02978594
    a 2.483246
    h 0.1591908
    r 0.0045124847
    z 0.00028653807
    e 4.0242333
    h 0.031035878
    n 0.028907888
    t 0.003264101
    e 0.0018929198
      0.05816966
    g 1.2782481
    e 3.5076692
    h 0.694337
    ö 0.5319732
    r 0.48336726
    t 0.0050443523
    e 0.0017187123
      0.14511283
    e 1.0435015
    r 0.18165778
      1.0247636
    z 0.3594512
    u 0.0077577736
      2.072764
    d 0.17377533
    e 1.0727838
    n 1.2805216
      0.24939628
    f 0.27717885
    ü 0.012466482
    h 4.4356546
    r 1.7371752
    e 0.051492628
    n 2.99407
    d 0.009648594
    e 0.19667451
    n 0.007495021
      0.2529005
    B 0.004451485
    i 0.024661187
    l 0.0028436247
    d 2.6620464
    h 2.825038
    a 0.8215449
    u 0.011406565
    e 2.9599652
    r 0.45834702
    n 0.11848967
      0.5955992
    N 0.010709903
    i 1.5338714
    e 0.1834471
    d 5.668945
    e 2.052247
    r 0.7692907
    b 0.0675718
    a 0.028234791
    y 0.0045266068
    e 4.1125383
    r 1.2630856
    n 5.436057
    s 0.46446246

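`torch.nn.functional.cross_entropy` uses the natural logarithm, so the per-character values above are in nats. If you would rather inspect them in bits, a one-line conversion is enough (just a convenience, not part of the original notebook):

```python
import math

# convert per-character cross-entropy from nats to bits
cross_entropy_bits = cross_entropy / math.log(2)
```
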
```python
from tqdm import tqdm

sentence_data = all_sentences_de

text_and_entropies = []
for text in tqdm(sentence_data):
    text_and_entropies.append([text, text_to_cross_entropy(text)])
```

    100%|██████████| 2000/2000 [00:09<00:00, 219.00it/s]

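Each entry in `text_and_entropies` pairs the raw sentence with its per-character entropy array; the selection loop below indexes these as `row[0]` and `row[1]`. A quick, optional peek at that structure:

```python
# each entry is [sentence, numpy array of per-character cross-entropies]
text, scores = text_and_entropies[0]
print(text[:20], scores.shape)
```
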
```python
from collections import Counter

# tokenizer size configurations: n-gram lengths to collect (longest first)
# and how many tokens to keep per length

# 4s
#target_lengths = [1]
#token_budgets = [36]

# 4m
target_lengths = [4,3,2,1]
token_budgets = [40,80,96,36]

# 4l
#target_lengths = [4,3,2,1]
#token_budgets = [384,320,160,36]

ngrams = [Counter() for l in target_lengths]
tokens = []

for tgi,tgl in enumerate(target_lengths):
    for row in tqdm(text_and_entropies[1:]):
        use_text = row[0]
        use_scores = row[1]
        # mask out n-grams that were already selected at a longer length
        for t in tokens:
            use_text = use_text.replace(t[0],'#')
        candidates = []
        for i in range(len(use_text)-(tgl-1)):
            part = use_text[i:i+tgl].lower()
            if '#' in part: continue
            if ' ' in part: continue
            if '-' in part: continue
            # candidate score = summed per-character cross-entropy of this n-gram
            score = sum(use_scores[i:i+tgl])
            # print(part, score)
            candidates.append([score, part])
        # keep only the 20% of candidates with the lowest total entropy
        candidates.sort(reverse=False)
        candidates = candidates[:max(1,int(len(candidates)/5))]
        #print(candidates)
        ngrams[tgi].update([c[1] for c in candidates])
    # the most frequent low-entropy n-grams of this length become tokens
    new_tokens = ngrams[tgi].most_common(token_budgets[tgi])
    print(new_tokens)
    tokens += new_tokens
    #break
```

    100%|██████████| 1999/1999 [00:00<00:00, 14645.88it/s]

    [('lich', 1000), ('hnte', 999), ('rbay', 999), ('örte', 999), ('hört', 999), ('ahrz', 999), ('jahr', 999), ('bild', 999)]

    100%|██████████| 1999/1999 [00:00<00:00, 18574.04it/s]

    [('ist', 1000), ('den', 999), ('ber', 999), ('aue', 999), ('ern', 999), ('uer', 999)]

    100%|██████████| 1999/1999 [00:00<00:00, 20827.32it/s]

    [('ni', 1000), ('ge', 999), ('er', 999), ('fü', 999), ('vi', 999)]

    100%|██████████| 1999/1999 [00:00<00:00, 19927.45it/s]

    [('e', 2999), ('u', 999), ('n', 999), ('h', 999)]

```python
all_tokens = ['<pad>','<eos>',' '] + [t[0] for t in tokens] + ['?']
print(len(all_tokens), all_tokens)
```

    27 ['<pad>', '<eos>', ' ', 'lich', 'hnte', 'rbay', 'örte', 'hört', 'ahrz', 'jahr', 'bild', 'ist', 'den', 'ber', 'aue', 'ern', 'uer', 'ni', 'ge', 'er', 'fü', 'vi', 'e', 'u', 'n', 'h', '?']

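Only 27 tokens come out here because the tiny two-sentence toy corpus does not contain enough distinct n-grams to fill the budgets. With a large enough corpus every `Counter` can return its full budget, so the 4m configuration above would reach (a quick sanity check, assuming all budgets are filled):

```python
# 3 special tokens (<pad>, <eos>, ' ') + per-length budgets + the '?' fallback
expected_size = 3 + sum([40, 80, 96, 36]) + 1
print(expected_size)  # 256
```
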
```python
import json
with open('./tevr-tokenizer.txt', 'wt') as f:
    json.dump(all_tokens, f)
```

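An optional round-trip check (not part of the original notebook) to confirm the file reads back cleanly:

```python
# reload the token list and make sure nothing was lost in serialization
with open('./tevr-tokenizer.txt', 'rt') as f:
    assert json.load(f) == all_tokens
```
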
```python
import sys
import os
sys.path.append(data_folder)
# the text_tokenizer module ships with the downloaded model snapshot
from text_tokenizer import HajoTextTokenizer
```

```python
text_tokenizer = HajoTextTokenizer('./tevr-tokenizer.txt')
```

```python
sentence = "gehörte"
print(sentence)
encoded = text_tokenizer.encode(sentence)
print(encoded)
print([text_tokenizer.all_tokens[i] for i in encoded])
print([text_tokenizer.decode(encoded)])
```

    gehörte
    [18, 25, 6]
    ['ge', 'h', 'örte']
    ['gehörte']

## Testing Tokenizer File

(copy of `TEVR Explanation.ipynb`)