Chris4K committed 950a593 (verified · parent: 4e5a67d)

Update app.py

Files changed (1):
  1. app.py +99 -85
app.py CHANGED
@@ -30,6 +30,9 @@ from sklearn.manifold import TSNE
 from sklearn.metrics import silhouette_score
 from scipy.stats import spearmanr
 from functools import lru_cache
+from langchain.retrievers import MultiQueryRetriever
+from langchain.llms import HuggingFacePipeline
+from transformers import pipeline
 
 # NLTK Resource Download
 def download_nltk_resources():
@@ -141,13 +144,11 @@ def preprocess_text(text, lang='german'):
 def phonetic_match(text, query, method='levenshtein_distance'):
     if method == 'levenshtein_distance':
         text_phonetic = jellyfish.soundex(text)
-        #query_phonetic = jellyfish.cologne_phonetic(query)
         query_phonetic = jellyfish.soundex(query)
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
 def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
-    # Tokenize the texts
     tokenized_texts = [text.split() for text in texts]
 
     if model_type == 'word2vec':
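
Both strings are reduced to Soundex codes before comparison, so the score is a Levenshtein distance over phonetic codes: 0 means the two inputs sound alike, larger means further apart. A quick standalone illustration:

```python
import jellyfish

# "Meyer" and "Maier" collapse to the same Soundex code, M600 ...
print(jellyfish.soundex("Meyer"), jellyfish.soundex("Maier"))  # M600 M600

# ... so phonetic_match() scores them 0 (lower = phonetically closer).
print(jellyfish.levenshtein_distance(
    jellyfish.soundex("Meyer"), jellyfish.soundex("Maier")))   # 0
```

The commented-out `cologne_phonetic` reference this commit removes pointed at Kölner Phonetik, the German-oriented scheme; Soundex itself is English-oriented.
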
@@ -169,7 +170,6 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
     def embed_query(self, text):
         return self.model.wv[text.split()]
 
-
 # Custom Tokenizer
 def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
     with open(file_path, 'r', encoding='utf-8') as f:
@@ -191,6 +191,7 @@ def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000,
     tokenizer.train_from_iterator([text], trainer)
 
     return tokenizer
+
 def custom_tokenize(text, tokenizer):
     return tokenizer.encode(text).tokens
 
@@ -220,15 +221,16 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separato
         raise ValueError(f"Unsupported split strategy: {split_strategy}")
 
 def get_vector_store(vector_store_type, chunks, embedding_model):
-    # Convert chunks to a tuple to make it hashable
     chunks_tuple = tuple(chunks)
-
-    # Use a helper function for the actual vector store creation
     return _create_vector_store(vector_store_type, chunks_tuple, embedding_model)
 
-
+def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_weight=0.3):
+    embedding_sim = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
+    phonetic_sim = phonetic_match(doc_text, query)
+    combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
+    return combined_sim
+
 def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
-    # Convert the tuple back to a list for use with the vector store
     chunks = list(chunks_tuple)
 
     if vector_store_type == 'FAISS':
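
The new `custom_similarity` blends a cosine-style embedding similarity with `phonetic_match`, but `phonetic_match` returns a distance (0 = identical Soundex codes) while the first term is a similarity, so as written the weighted sum rewards phonetically distant pairs. A minimal normalized variant, assuming the `phonetic_match` defined earlier in app.py:

```python
import numpy as np

def custom_similarity_normalized(query_embedding, doc_embedding, query, doc_text,
                                 phonetic_weight=0.3):
    # Cosine similarity between the two embedding vectors, as in the commit.
    embedding_sim = np.dot(query_embedding, doc_embedding) / (
        np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
    # phonetic_match() yields a Soundex Levenshtein distance (0 = identical),
    # so map it into (0, 1] before blending: distance 0 -> similarity 1.
    phonetic_sim = 1.0 / (1.0 + phonetic_match(doc_text, query))
    return (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
```
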
@@ -238,15 +240,13 @@ def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
     else:
         raise ValueError(f"Unsupported vector store type: {vector_store_type}")
 
-
 def get_retriever(vector_store, search_type, search_kwargs):
     if search_type == 'similarity':
         return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     elif search_type == 'mmr':
         return vector_store.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
     elif search_type == 'custom':
-        # Implement custom retriever logic here
-        pass
+        return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
     else:
         raise ValueError(f"Unsupported search type: {search_type}")
 
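With this change the 'custom' branch behaves exactly like 'similarity', and `custom_similarity` above is never wired in. One way to actually use it is to over-fetch with plain similarity search and rescore; a sketch assuming app.py's `custom_similarity` and the standard LangChain `similarity_search`/`embed_query` interface (`custom_search` is a hypothetical helper, not in the commit):

```python
def custom_search(vector_store, embedding_model, query, k=5, phonetic_weight=0.3):
    # Over-fetch candidates with the stock similarity search ...
    candidates = vector_store.similarity_search(query, k=k * 3)
    query_emb = embedding_model.embed_query(query)
    # ... then rescore each candidate with the blended metric and keep top-k.
    scored = [
        (custom_similarity(query_emb,
                           embedding_model.embed_query(doc.page_content),
                           query, doc.page_content, phonetic_weight), doc)
        for doc in candidates
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scored[:k]]
```
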
@@ -290,15 +290,13 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     results = sorted(results, key=score_result, reverse=True)
     end_time = time.time()
 
-    # Check if embeddings are available
     embeddings = []
     for doc in results:
         if hasattr(doc, 'embedding'):
-            embeddings.append(doc.embedding) # Use the embedding if it exists
+            embeddings.append(doc.embedding)
         else:
-            embeddings.append(None) # Append None if embedding doesn't exist
+            embeddings.append(None)
 
-    # Create a DataFrame with the results and embeddings
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
         'embedding': embeddings
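
Documents coming back from FAISS or Chroma searches do not normally carry an `.embedding` attribute, so the `hasattr` branch above will usually append None for every row. If the column should hold real vectors, re-embedding the returned content is one workaround (a sketch against the standard LangChain embeddings interface; `embeddings_for_results` is hypothetical):

```python
def embeddings_for_results(results, embedding_model):
    # embed_documents is the batch call on LangChain embedding wrappers;
    # it returns one vector per input text.
    return embedding_model.embed_documents([doc.page_content for doc in results])
```
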
@@ -307,13 +305,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     return results_df, end_time - start_time, vector_store, results
 
 # Evaluation Metrics
+# ... (previous code remains the same)
+
 def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
     stats = {
         "num_results": len(results),
-        # "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
-
-        #"avg_content_length": np.mean([len(doc.page_content) for doc in results]) if not results.empty else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -328,10 +325,7 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
         pairwise_similarities = np.inner(embeddings, embeddings)
         stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
 
-    # Silhouette Score
     if len(embeddings) > 2:
-        print('-----')
-        #stats["silhouette_score"] = "N/A"
         stats["silhouette_score"] = silhouette_score(embeddings, range(len(embeddings)))
     else:
         stats["silhouette_score"] = "N/A"
@@ -378,24 +372,34 @@ def visualize_results(results_df, stats_df):
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
 
-    # Count word frequencies
     word_freq = Counter(word for text in texts for word in text.split())
 
-    # Remove rare words
     optimized_texts = [
         ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
         for text in texts
     ]
 
-    # Train BPE tokenizer
-    # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
     return tokenizer, optimized_texts
 
+# New preprocessing function
+def optimize_query(query, llm):
+    multi_query_retriever = MultiQueryRetriever.from_llm(
+        retriever=get_retriever(vector_store, search_type, search_kwargs),
+        llm=llm
+    )
+    optimized_queries = multi_query_retriever.generate_queries(query)
+    return optimized_queries
+
+# New postprocessing function
+def rerank_results(results, query, reranker):
+    reranked_results = reranker.rerank(query, [doc.page_content for doc in results])
+    return reranked_results
+
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
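
As committed, `optimize_query` references `vector_store`, `search_type`, and `search_kwargs`, none of which exist in its scope (the vector store is only built later, inside `search_embeddings`), and `MultiQueryRetriever.generate_queries` additionally expects a callbacks run manager in the LangChain releases I am aware of, so this function fails at call time. A self-contained sketch that generates query variants straight from the LLM instead (prompt wording is illustrative):

```python
def optimize_query(query, llm):
    # LangChain HuggingFacePipeline instances are callable on a prompt string.
    response = llm(
        "Generate three alternative phrasings of the following search query, "
        f"one per line:\n{query}"
    )
    variants = [line.strip() for line in response.split("\n") if line.strip()]
    return [query] + variants
```

Note also that the caller further down joins all variants into a single string (`query = " ".join(optimized_queries)`), which concatenates the paraphrases rather than searching per variant.
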
@@ -408,16 +412,16 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
         "top_k": top_k,
         "lang": lang,
         "optimize_vocab": optimize_vocab,
-        "phonetic_weight": phonetic_weight
+        "phonetic_weight": phonetic_weight,
+        "use_query_optimization": use_query_optimization,
+        "use_reranking": use_reranking
     }
 
-    # Parse embedding models
     models = [model.strip().split(':') for model in embedding_models.split(',')]
     if custom_embedding_model:
         models.append(custom_embedding_model.strip().split(':'))
 
     for model_type, model_name in models:
-        # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
@@ -433,17 +437,19 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
         )
 
-        # Custom embedding handling
-        #if use_custom_embedding:
-        #    custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
-        #    embedding_model = CustomEmbeddings(custom_model)
-
-        # Optimizing vocabulary if required
         if optimize_vocab:
             tokenizer, optimized_chunks = optimize_vocabulary(chunks)
             chunks = optimized_chunks
 
-        # Searching embeddings
+        if use_query_optimization:
+            llm = HuggingFacePipeline.from_model_id(
+                model_id="google/flan-t5-base",
+                task="text2text-generation",
+                model_kwargs={"temperature": 0, "max_length": 64},
+            )
+            optimized_queries = optimize_query(query, llm)
+            query = " ".join(optimized_queries)
+
         results, search_time, vector_store, results_raw = search_embeddings(
             chunks,
             embedding_model,
@@ -455,32 +461,26 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             phonetic_weight
         )
 
-        # Storing embeddings into the results for future use
-        for doc in results_raw:
-            print(doc) # or print(dir(doc)) to see available attributes
+        if use_reranking:
+            reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
+            results_raw = rerank_results(results_raw, query, reranker)
 
-        #embedding = doc.metadata.get('embedding', None) # Use .get() to avoid KeyError
-
-        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw] # Adjust this based on the actual attribute names
-        # result_embeddings = [doc['embedding'] for doc in results_raw] # Assuming each result has an embedding
+        result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
         stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
 
-        # Formatting results and attaching embeddings
         formatted_results = format_results(results_raw, stats)
         for i, result in enumerate(formatted_results):
-            result['embedding'] = result_embeddings[i] # Add the embedding to each result
+            result['embedding'] = result_embeddings[i]
 
         all_results.extend(formatted_results)
         all_stats.append(stats)
 
-    # Create DataFrames with embeddings now included
     results_df = pd.DataFrame(all_results)
     stats_df = pd.DataFrame(all_stats)
 
-    # Visualization of the results
     fig = visualize_results(results_df, stats_df)
 
     return results_df, stats_df, fig
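
Two caveats in the reranking block: a transformers `text-classification` pipeline has no `.rerank()` method, so `rerank_results` as defined earlier will raise AttributeError; and it hands back page-content strings, which `calculate_statistics` (expecting Document objects) cannot consume. A sketch of a `rerank_results` that keeps Documents and scores (query, passage) pairs with the same cross-encoder via sentence-transformers (an assumed dependency, not imported by this commit):

```python
from sentence_transformers import CrossEncoder

def rerank_results(results, query,
                   model_name="cross-encoder/ms-marco-MiniLM-L-12-v2"):
    encoder = CrossEncoder(model_name)
    # Cross-encoders score (query, passage) pairs jointly.
    scores = encoder.predict([(query, doc.page_content) for doc in results])
    ranked = sorted(zip(scores, results), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]  # Documents, highest score first
```
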
@@ -500,36 +500,52 @@ def format_results(results, stats):
 
 # Gradio Interface
 def launch_interface(share=True):
-    iface = gr.Interface(
-        fn=compare_embeddings,
-        inputs=[
-            gr.File(label="Upload File (Optional)"),
-            gr.Textbox(label="Search Query"),
-            gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)"),
-            gr.Textbox(label="Custom Embedding Model (optional, format: type:name)"),
-            gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
-            gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
-            gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
-            gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
-            gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
-            gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
-            gr.Slider(1, 10, step=1, value=5, label="Top K"),
-            gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
-            gr.Checkbox(label="Optimize Vocabulary", value=False),
-            gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
-            gr.File(label="Custom Tokenizer File (Optional)"),
-            gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)"),
-            gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000"),
-            gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
-        ],
-        outputs=[
-            gr.Dataframe(label="Results", interactive=False),
-            gr.Dataframe(label="Statistics", interactive=False),
-            gr.Plot(label="Visualizations")
-        ],
-        title="Advanced Embedding Comparison Tool",
-        description="Compare different embedding models and retrieval strategies with advanced preprocessing and phonetic matching"
-    )
+    with gr.Blocks() as iface:
+        gr.Markdown("# Advanced Embedding Comparison Tool")
+
+        with gr.Tab("Simple"):
+            file_input = gr.File(label="Upload File (Optional)")
+            query_input = gr.Textbox(label="Search Query")
+            embedding_models_input = gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)")
+            top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
+
+        with gr.Tab("Advanced"):
+            custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
+            split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
+            chunk_size_input = gr.Slider(100, 1000, step=100, value=500, label="Chunk Size")
+            overlap_size_input = gr.Slider(0, 100, step=10, value=50, label="Overlap Size")
+            custom_separators_input = gr.Textbox(label="Custom Split Separators (comma-separated, optional)")
+            vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
+            search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
+            lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
+
+        with gr.Tab("Optional"):
+            optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
+            phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
+            custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
+            custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
+            custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
+            custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
+            use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
+            use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
+
+        results_output = gr.Dataframe(label="Results", interactive=False)
+        stats_output = gr.Dataframe(label="Statistics", interactive=False)
+        plot_output = gr.Plot(label="Visualizations")
+
+        submit_button = gr.Button("Compare Embeddings")
+        submit_button.click(
+            fn=compare_embeddings,
+            inputs=[
+                file_input, query_input, embedding_models_input, custom_embedding_model_input,
+                split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
+                vector_store_type_input, search_type_input, top_k_input, lang_input,
+                optimize_vocab_input, phonetic_weight_input, custom_tokenizer_file_input,
+                custom_tokenizer_model_input, custom_tokenizer_vocab_size_input,
+                custom_tokenizer_special_tokens_input, use_query_optimization_input, use_reranking_input
+            ],
+            outputs=[results_output, stats_output, plot_output]
+        )
 
     tutorial_md = """
     # Advanced Embedding Comparison Tool Tutorial
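
One small wrinkle in the new Blocks layout: the vocab-size field stays a `gr.Textbox`, so `compare_embeddings` receives the string "10000" where an integer vocab size is expected downstream. A `gr.Number` input sidesteps the cast (a drop-in suggestion, not part of the commit):

```python
import gradio as gr

# precision=0 makes gr.Number deliver an int rather than a float.
custom_tokenizer_vocab_size_input = gr.Number(
    label="Custom Tokenizer Vocab Size", value=10000, precision=0)
```
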
@@ -541,13 +557,10 @@ def launch_interface(share=True):
 1. Upload a file (optional) or use the default files in the system.
 2. Enter a search query.
 3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
-4. Optionally, specify a custom embedding model in the format type:name.
-5. Choose a text splitting strategy and set chunk size and overlap.
-6. Select a vector store type and search type.
-7. Set the number of top results to retrieve.
-8. Choose the language of your documents.
-9. Optionally, optimize vocabulary or adjust phonetic matching weight.
-10. If you have a custom tokenizer, upload the file and specify its attributes.
+4. Set the number of top results to retrieve.
+5. Optionally, specify advanced settings such as custom embedding models, text splitting strategies, and vector store types.
+6. Choose whether to use optional features like vocabulary optimization, query optimization, or result reranking.
+7. If you have a custom tokenizer, upload the file and specify its attributes.
 
 The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
 """
@@ -559,4 +572,5 @@ def launch_interface(share=True):
 
     iface.launch(share=share)
 
-launch_interface()
+if __name__ == "__main__":
+    launch_interface()