Update app.py
app.py CHANGED
@@ -343,6 +343,23 @@ def visualize_results(results_df, stats_df):
     plt.tight_layout()
     return fig
 
+def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
+    # Count word frequencies
+    word_freq = Counter(word for text in texts for word in text.split())
+
+    # Remove rare words
+    optimized_texts = [
+        ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
+        for text in texts
+    ]
+
+    # Train BPE tokenizer
+    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+    tokenizer.train_from_iterator(optimized_texts, trainer)
+
+    return tokenizer, optimized_texts
+
 # Main Comparison Function
 def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
     all_results = []