Chris4K committed
Commit 027365f
1 Parent(s): 0646ad5

Update app.py

Files changed (1):
  1. app.py +4 -2
app.py CHANGED
@@ -20,7 +20,7 @@ import jellyfish
 from gensim.models import Word2Vec
 from gensim.models.fasttext import FastText
 from collections import Counter
-from tokenizers import Tokenizer
+from tokenizers import Tokenizer, models
 from tokenizers.models import WordLevel
 from tokenizers.trainers import WordLevelTrainer
 from tokenizers.pre_tokenizers import Whitespace
@@ -344,6 +344,8 @@ def visualize_results(results_df, stats_df):
     return fig
 
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
+    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+
     # Count word frequencies
     word_freq = Counter(word for text in texts for word in text.split())
 
@@ -354,7 +356,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     ]
 
     # Train BPE tokenizer
-    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
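Read together, the two hunks fix a likely NameError: the removed line constructed the tokenizer with a bare BPE(...), but no BPE import appears in the visible context (only Tokenizer and WordLevel are imported from the tokenizers package). The commit instead imports the models module and builds the tokenizer via models.BPE at the top of optimize_vocabulary, leaving the old call commented out. Below is a minimal sketch of how the patched function reads; the word-filtering body, the BpeTrainer import, the pre_tokenizer assignment, and the return value are assumptions for illustration, since those lines fall outside the hunk context shown above.

# A sketch of optimize_vocabulary as it reads after this commit; parts
# outside the diff context are assumptions, marked below.
from collections import Counter
from tokenizers import Tokenizer, models
from tokenizers.trainers import BpeTrainer   # assumed import, not in the hunk context
from tokenizers.pre_tokenizers import Whitespace

def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
    # Build the tokenizer up front, reaching BPE through the imported
    # `models` module so the name is actually defined.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    # Assumed: split on whitespace before BPE training (the import is
    # present in the context, but the assignment is not shown).
    tokenizer.pre_tokenizer = Whitespace()

    # Count word frequencies across the corpus.
    word_freq = Counter(word for text in texts for word in text.split())

    # Assumed filtering step: drop words rarer than min_frequency.
    optimized_texts = [
        " ".join(w for w in text.split() if word_freq[w] >= min_frequency)
        for text in texts
    ]

    # Train BPE tokenizer on the filtered corpus.
    trainer = BpeTrainer(vocab_size=vocab_size,
                         special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.train_from_iterator(optimized_texts, trainer)
    return tokenizer, optimized_texts   # assumed return value

One side note on the design: BpeTrainer itself accepts a min_frequency parameter, so the manual pre-filtering pass could in principle be delegated to the trainer; the commit keeps the app's existing two-step approach and only fixes where and how the tokenizer is constructed.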