Update app.py
app.py CHANGED
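This commit wires three features into app.py: LLM-based query expansion via LangChain's MultiQueryRetriever, optional cross-encoder reranking of search results, and a tabbed gr.Blocks interface that replaces the old flat list of inputs.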
```diff
@@ -30,6 +30,9 @@ from sklearn.manifold import TSNE
 from sklearn.metrics import silhouette_score
 from scipy.stats import spearmanr
 from functools import lru_cache
+from langchain.retrievers import MultiQueryRetriever
+from langchain.llms import HuggingFacePipeline
+from transformers import pipeline
 
 # NLTK Resource Download
 def download_nltk_resources():
```
```diff
@@ -141,13 +144,11 @@ def preprocess_text(text, lang='german'):
 def phonetic_match(text, query, method='levenshtein_distance'):
     if method == 'levenshtein_distance':
         text_phonetic = jellyfish.soundex(text)
-        #query_phonetic = jellyfish.cologne_phonetic(query)
         query_phonetic = jellyfish.soundex(query)
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
 def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
-    # Tokenize the texts
     tokenized_texts = [text.split() for text in texts]
 
     if model_type == 'word2vec':
```
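The deleted line suggests Cologne phonetics was considered for German input; jellyfish does not ship a Cologne implementation, so both sides fall back to Soundex, which is designed for English names. Note also that the function returns a Levenshtein distance over the two Soundex codes, so 0 means a perfect phonetic match; this matters where the value is later blended with a cosine similarity. A minimal sketch of what the function computes (assumes the jellyfish package):

```python
import jellyfish

text_code = jellyfish.soundex("Meier")   # 'M600'
query_code = jellyfish.soundex("Mayer")  # 'M600'
# Identical codes -> distance 0, i.e. the best possible phonetic match.
print(jellyfish.levenshtein_distance(text_code, query_code))  # 0
```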
```diff
@@ -169,7 +170,6 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
     def embed_query(self, text):
        return self.model.wv[text.split()]
 
-
 # Custom Tokenizer
 def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
     with open(file_path, 'r', encoding='utf-8') as f:
```
```diff
@@ -191,6 +191,7 @@ def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000,
     tokenizer.train_from_iterator([text], trainer)
 
     return tokenizer
+
 def custom_tokenize(text, tokenizer):
     return tokenizer.encode(text).tokens
 
```
```diff
@@ -220,15 +221,16 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
         raise ValueError(f"Unsupported split strategy: {split_strategy}")
 
 def get_vector_store(vector_store_type, chunks, embedding_model):
-    # Convert chunks to a tuple to make it hashable
     chunks_tuple = tuple(chunks)
-
-    # Use a helper function for the actual vector store creation
     return _create_vector_store(vector_store_type, chunks_tuple, embedding_model)
 
-
+def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_weight=0.3):
+    embedding_sim = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
+    phonetic_sim = phonetic_match(doc_text, query)
+    combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
+    return combined_sim
+
 def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
-    # Convert the tuple back to a list for use with the vector store
     chunks = list(chunks_tuple)
 
     if vector_store_type == 'FAISS':
```
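The new custom_similarity blends a cosine similarity (1 = identical) with the raw output of phonetic_match, which is a distance (0 = identical), so as written a worse phonetic match raises the combined score. A sketch of one way to normalize the distance into a similarity before mixing; this is an assumption about the intent, not what the commit does:

```python
# Hypothetical helper: convert the Soundex Levenshtein distance into a
# 0..1 similarity so it can be mixed with cosine similarity on equal terms.
import jellyfish

def phonetic_similarity(text, query):
    dist = phonetic_match(text, query)  # from this file: distance, 0 = identical
    max_len = max(len(jellyfish.soundex(text)), len(jellyfish.soundex(query)))
    return 1.0 - dist / max_len if max_len else 1.0
```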
```diff
@@ -238,15 +240,13 @@ def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
     else:
         raise ValueError(f"Unsupported vector store type: {vector_store_type}")
 
-
 def get_retriever(vector_store, search_type, search_kwargs):
     if search_type == 'similarity':
         return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     elif search_type == 'mmr':
         return vector_store.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
     elif search_type == 'custom':
-
-        pass
+        return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     else:
         raise ValueError(f"Unsupported search type: {search_type}")
 
```
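The 'custom' branch no longer returns None via a bare pass, but it now behaves exactly like 'similarity', so the custom_similarity helper added above is never actually wired into retrieval. The assumed call shape, using names from this file:

```python
# 'custom' currently degrades to plain similarity search.
retriever = get_retriever(vector_store, "custom", {"k": top_k})
docs = retriever.get_relevant_documents(query)
```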
```diff
@@ -290,15 +290,13 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     results = sorted(results, key=score_result, reverse=True)
     end_time = time.time()
 
-    # Check if embeddings are available
     embeddings = []
     for doc in results:
         if hasattr(doc, 'embedding'):
-            embeddings.append(doc.embedding)
+            embeddings.append(doc.embedding)
         else:
-            embeddings.append(None)
+            embeddings.append(None)
 
-    # Create a DataFrame with the results and embeddings
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
         'embedding': embeddings
```
```diff
@@ -307,13 +305,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     return results_df, end_time - start_time, vector_store, results
 
 # Evaluation Metrics
+# ... (previous code remains the same)
+
 def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
     stats = {
         "num_results": len(results),
-        # "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
-
-        #"avg_content_length": np.mean([len(doc.page_content) for doc in results]) if not results.empty else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
```
```diff
@@ -328,10 +325,7 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     pairwise_similarities = np.inner(embeddings, embeddings)
     stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
 
-    # Silhouette Score
     if len(embeddings) > 2:
-        print('-----')
-        #stats["silhouette_score"] = "N/A"
         stats["silhouette_score"] = silhouette_score(embeddings, range(len(embeddings)))
     else:
         stats["silhouette_score"] = "N/A"
```
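silhouette_score(embeddings, range(len(embeddings))) assigns every result its own cluster label; scikit-learn requires 2 <= n_labels <= n_samples - 1 and raises ValueError for this input, so the len(embeddings) > 2 guard does not save it. A sketch of one fix, clustering the results first (the choice of KMeans and n_clusters=2 is an assumption):

```python
# Cluster the result embeddings before scoring; any labeling with
# 2 <= n_labels <= n_samples - 1 satisfies silhouette_score.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(embeddings)
stats["silhouette_score"] = silhouette_score(embeddings, labels)
```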
```diff
@@ -378,24 +372,34 @@ def visualize_results(results_df, stats_df):
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
 
-    # Count word frequencies
     word_freq = Counter(word for text in texts for word in text.split())
 
-    # Remove rare words
     optimized_texts = [
         ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
         for text in texts
     ]
 
-    # Train BPE tokenizer
-    # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
     return tokenizer, optimized_texts
 
+# New preprocessing function
+def optimize_query(query, llm):
+    multi_query_retriever = MultiQueryRetriever.from_llm(
+        retriever=get_retriever(vector_store, search_type, search_kwargs),
+        llm=llm
+    )
+    optimized_queries = multi_query_retriever.generate_queries(query)
+    return optimized_queries
+
+# New postprocessing function
+def rerank_results(results, query, reranker):
+    reranked_results = reranker.rerank(query, [doc.page_content for doc in results])
+    return reranked_results
+
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
```
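optimize_query references vector_store, search_type, and search_kwargs, none of which are in its scope (NameError at call time), and generate_queries is not part of MultiQueryRetriever's public API in the form called here; the retriever normally generates the query variants internally. A hedged sketch of the documented LangChain pattern, using names from this file:

```python
# MultiQueryRetriever asks the LLM for query variants itself and
# merges the hits from each variant.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(search_kwargs={"k": top_k}),
    llm=llm,
)
docs = retriever.get_relevant_documents(query)
```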
```diff
@@ -408,16 +412,16 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
         "top_k": top_k,
         "lang": lang,
         "optimize_vocab": optimize_vocab,
-        "phonetic_weight": phonetic_weight
+        "phonetic_weight": phonetic_weight,
+        "use_query_optimization": use_query_optimization,
+        "use_reranking": use_reranking
     }
 
-    # Parse embedding models
     models = [model.strip().split(':') for model in embedding_models.split(',')]
     if custom_embedding_model:
         models.append(custom_embedding_model.strip().split(':'))
 
     for model_type, model_name in models:
-        # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
```
```diff
@@ -433,17 +437,19 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
         )
 
-        # Custom embedding handling
-        #if use_custom_embedding:
-        #    custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
-        #    embedding_model = CustomEmbeddings(custom_model)
-
-        # Optimizing vocabulary if required
         if optimize_vocab:
             tokenizer, optimized_chunks = optimize_vocabulary(chunks)
             chunks = optimized_chunks
 
-
+        if use_query_optimization:
+            llm = HuggingFacePipeline.from_model_id(
+                model_id="google/flan-t5-base",
+                task="text2text-generation",
+                model_kwargs={"temperature": 0, "max_length": 64},
+            )
+            optimized_queries = optimize_query(query, llm)
+            query = " ".join(optimized_queries)
+
         results, search_time, vector_store, results_raw = search_embeddings(
             chunks,
             embedding_model,
```
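Joining the generated variants with " ".join(...) concatenates several full questions into one string before a single search. Multi-query expansion is usually applied by searching each variant separately and de-duplicating the union; a sketch of that alternative, assuming optimize_query returned a list of strings and a retriever is in scope:

```python
# Alternative to query = " ".join(optimized_queries): search each variant
# and merge, keeping one copy of each distinct chunk.
all_docs = []
for q in optimized_queries:
    all_docs.extend(retriever.get_relevant_documents(q))
unique_docs = list({d.page_content: d for d in all_docs}.values())
```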
```diff
@@ -455,32 +461,26 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             phonetic_weight
         )
 
-
-
-
+        if use_reranking:
+            reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
+            results_raw = rerank_results(results_raw, query, reranker)
 
-
-
-        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw] # Adjust this based on the actual attribute names
-        # result_embeddings = [doc['embedding'] for doc in results_raw] # Assuming each result has an embedding
+        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
         stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
 
-        # Formatting results and attaching embeddings
         formatted_results = format_results(results_raw, stats)
         for i, result in enumerate(formatted_results):
-            result['embedding'] = result_embeddings[i]
+            result['embedding'] = result_embeddings[i]
 
         all_results.extend(formatted_results)
         all_stats.append(stats)
 
-    # Create DataFrames with embeddings now included
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
 
-    # Visualization of the results
     fig = visualize_results(results_df, stats_df)
 
     return results_df, stats_df, fig
```
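Two issues in the reranking path: a transformers pipeline object has no .rerank() method, so rerank_results raises AttributeError; and even if it worked, it would return bare strings while the code below still expects Document objects with .metadata. A sketch of one working pattern with the same cross-encoder, scoring (query, passage) pairs and reordering the original documents; this is an assumption, not the committed behavior:

```python
# Score each (query, passage) pair with the cross-encoder and sort the
# original Document objects by that score, preserving metadata.
pairs = [{"text": query, "text_pair": doc.page_content} for doc in results_raw]
scores = [out["score"] for out in reranker(pairs)]
results_raw = [doc for _, doc in sorted(
    zip(scores, results_raw), key=lambda t: t[0], reverse=True)]
```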
```diff
@@ -500,36 +500,52 @@ def format_results(results, stats):
 
 # Gradio Interface
 def launch_interface(share=True):
-    [30 removed lines: the previous flat gr.* input list (gr.Textbox, gr.Radio, gr.Slider, ...); truncated in the page capture]
+    with gr.Blocks() as iface:
+        gr.Markdown("# Advanced Embedding Comparison Tool")
+
+        with gr.Tab("Simple"):
+            file_input = gr.File(label="Upload File (Optional)")
+            query_input = gr.Textbox(label="Search Query")
+            embedding_models_input = gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)")
+            top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
+
+        with gr.Tab("Advanced"):
+            custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
+            split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
+            chunk_size_input = gr.Slider(100, 1000, step=100, value=500, label="Chunk Size")
+            overlap_size_input = gr.Slider(0, 100, step=10, value=50, label="Overlap Size")
+            custom_separators_input = gr.Textbox(label="Custom Split Separators (comma-separated, optional)")
+            vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
+            search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
+            lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
+
+        with gr.Tab("Optional"):
+            optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
+            phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
+            custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
+            custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
+            custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
+            custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
+            use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
+            use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
+
+        results_output = gr.Dataframe(label="Results", interactive=False)
+        stats_output = gr.Dataframe(label="Statistics", interactive=False)
+        plot_output = gr.Plot(label="Visualizations")
+
+        submit_button = gr.Button("Compare Embeddings")
+        submit_button.click(
+            fn=compare_embeddings,
+            inputs=[
+                file_input, query_input, embedding_models_input, custom_embedding_model_input,
+                split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
+                vector_store_type_input, search_type_input, top_k_input, lang_input,
+                optimize_vocab_input, phonetic_weight_input, custom_tokenizer_file_input,
+                custom_tokenizer_model_input, custom_tokenizer_vocab_size_input,
+                custom_tokenizer_special_tokens_input, use_query_optimization_input, use_reranking_input
+            ],
+            outputs=[results_output, stats_output, plot_output]
+        )
 
     tutorial_md = """
     # Advanced Embedding Comparison Tool Tutorial
```
```diff
@@ -541,13 +557,10 @@ def launch_interface(share=True):
     1. Upload a file (optional) or use the default files in the system.
     2. Enter a search query.
     3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
-    4.
-    5.
-    6.
-    7.
-    8. Choose the language of your documents.
-    9. Optionally, optimize vocabulary or adjust phonetic matching weight.
-    10. If you have a custom tokenizer, upload the file and specify its attributes.
+    4. Set the number of top results to retrieve.
+    5. Optionally, specify advanced settings such as custom embedding models, text splitting strategies, and vector store types.
+    6. Choose whether to use optional features like vocabulary optimization, query optimization, or result reranking.
+    7. If you have a custom tokenizer, upload the file and specify its attributes.
 
     The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
     """
```
```diff
@@ -559,4 +572,5 @@ def launch_interface(share=True):
 
     iface.launch(share=share)
 
-
+if __name__ == "__main__":
+    launch_interface()
```