Chris4K committed on
Commit
0646ad5
1 Parent(s): 738ada4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -0
app.py CHANGED
@@ -343,6 +343,23 @@ def visualize_results(results_df, stats_df):
343
  plt.tight_layout()
344
  return fig
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  # Main Comparison Function
347
  def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
348
  all_results = []
 
343
  plt.tight_layout()
344
  return fig
345
 
346
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
    """Drop rare words from *texts* and train a BPE tokenizer on what remains.

    Args:
        texts: Collection of strings. It is iterated twice (once to count
            words, once to filter), and each string is split on whitespace.
        vocab_size: Vocabulary size handed to the BPE trainer.
        min_frequency: Minimum corpus-wide occurrence count a word needs in
            order to be kept in the filtered texts.

    Returns:
        A ``(tokenizer, optimized_texts)`` tuple: the trained tokenizer and
        the list of filtered texts, with surviving words joined by one space.
    """
    # Pass 1: tally every whitespace-delimited word across the whole corpus.
    occurrences = Counter()
    for document in texts:
        occurrences.update(document.split())

    # Pass 2: rebuild each text from the words that are frequent enough.
    optimized_texts = []
    for document in texts:
        kept_words = [
            token
            for token in document.split()
            if occurrences[token] >= min_frequency
        ]
        optimized_texts.append(' '.join(kept_words))

    # Train a BPE tokenizer on the filtered corpus.
    # NOTE(review): no pre-tokenizer is configured on the tokenizer before
    # training -- confirm that training on whole strings is intended.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    reserved_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    bpe_trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=reserved_tokens)
    tokenizer.train_from_iterator(optimized_texts, bpe_trainer)

    return tokenizer, optimized_texts
362
+
363
  # Main Comparison Function
364
  def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
365
  all_results = []