Update app.py
app.py CHANGED

@@ -20,7 +20,7 @@ import jellyfish
 from gensim.models import Word2Vec
 from gensim.models.fasttext import FastText
 from collections import Counter
-from tokenizers import Tokenizer, models
+from tokenizers import Tokenizer, models, trainers
 from tokenizers.models import WordLevel
 from tokenizers.trainers import WordLevelTrainer
 from tokenizers.pre_tokenizers import Whitespace

@@ -357,7 +357,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
 
     # Train BPE tokenizer
     # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
-    trainer =
+    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
     return tokenizer, optimized_texts
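For context, a minimal self-contained sketch of how the changed lines are meant to fit together. It assumes the tokenizer is constructed as in the commented-out line of the diff (Tokenizer(BPE(unk_token="[UNK]"))) and pre-tokenized on whitespace, consistent with the existing Whitespace import; sample_texts and the vocab size are illustrative, not from the commit:

from tokenizers import Tokenizer, trainers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

# Illustrative corpus; in app.py this role is played by optimized_texts.
sample_texts = ["hello world", "hello tokenizers", "world of text"]

# Assumed setup, per the commented-out line in the diff.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Mirrors the added line: a BPE trainer with a vocabulary cap and special tokens.
trainer = trainers.BpeTrainer(
    vocab_size=10000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(sample_texts, trainer)

print(tokenizer.encode("hello world").tokens)

Note that the two hunks depend on each other: without the added trainers import at line 23, the reference to trainers.BpeTrainer at line 360 would raise a NameError.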