Chris4K committed
Commit cc69ccc · verified · 1 Parent(s): c38e61c

Update app.py

Files changed (1)
  1. app.py +126 -17
app.py CHANGED
@@ -33,6 +33,10 @@ from functools import lru_cache
 from langchain.retrievers import MultiQueryRetriever
 from langchain.llms import HuggingFacePipeline
 from transformers import pipeline
+from sklearn.model_selection import ParameterGrid
+from tqdm import tqdm
+import random
+
 
 # NLTK Resource Download
 def download_nltk_resources():
@@ -548,8 +552,8 @@ from tqdm import tqdm
 
 # ... (previous code remains the same)
 
-# New function for automated testing
-def automated_testing(file, query, test_params):
+# function for automated testing
+def automated_testing(file, query, test_params, expected_result=None):
     all_results = []
     all_stats = []
 
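This commit adds the ParameterGrid and tqdm imports, but the loop that consumes them is elided above as "# ... (previous code remains the same)". A minimal sketch of how automated_testing presumably expands test_params into individual configurations follows; the helper name is illustrative, not taken from app.py:

from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

def iterate_configurations(test_params):
    # ParameterGrid expands a dict of lists into every combination,
    # e.g. {'chunk_size': [250, 500], 'top_k': [3, 5]} yields 4 configurations.
    # tqdm wraps the iterable to show progress across all configurations.
    for params in tqdm(ParameterGrid(test_params), desc="Testing configurations"):
        yield params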
@@ -587,6 +591,7 @@ def automated_testing(file, query, test_params):
             params['search_type'],
             query,
             params['top_k'],
+            expected_result,
             params['lang'],
             params['apply_phonetic'],
             params['phonetic_weight']
@@ -596,7 +601,7 @@ def automated_testing(file, query, test_params):
             reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
             results_raw = rerank_results(results_raw, query, reranker)
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'])
+        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
        stats["model"] = f"{params['model_type']} - {params['model_name']}"
        stats.update(params)
 
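calculate_statistics now receives expected_result, and analyze_results below scores contains_expected and expected_result_rank, but the metric computation itself is not part of this diff. A plausible sketch, assuming results_raw is a ranked list of result texts; only the two metric keys come from app.py, the function name is illustrative:

def expected_result_metrics(results_raw, expected_result):
    # 1-based rank of the first result containing the expected text; 0 if absent.
    rank = 0
    if expected_result:
        for i, result in enumerate(results_raw, start=1):
            if expected_result.lower() in str(result).lower():
                rank = i
                break
    return {
        "contains_expected": rank > 0,   # weighted +0.5 in analyze_results
        "expected_result_rank": rank,    # lower (closer to 1) is better, weighted -0.4
    }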
@@ -605,30 +610,28 @@ def automated_testing(file, query, test_params):
 
     return pd.DataFrame(all_results), pd.DataFrame(all_stats)
 
+
 # Function to analyze results and propose best model and settings
 def analyze_results(stats_df):
-    # Define weights for different metrics (adjust as needed)
     metric_weights = {
-        'search_time': -0.3,  # Lower is better
+        'search_time': -0.3,
         'result_diversity': 0.2,
         'rank_correlation': 0.3,
-        'silhouette_score': 0.2
+        'silhouette_score': 0.2,
+        'contains_expected': 0.5,  # High weight for containing the expected result
+        'expected_result_rank': -0.4  # Lower rank (closer to 1) is better
     }
 
-    # Convert relevant columns to numeric type
     for metric in metric_weights.keys():
         stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
 
-    # Calculate weighted score for each configuration
     stats_df['weighted_score'] = sum(
         stats_df[metric].fillna(0) * weight
         for metric, weight in metric_weights.items()
     )
 
-    # Get the best configuration
     best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
 
-    # Generate recommendations
     recommendations = {
        'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
        'best_settings': {
@@ -640,19 +643,103 @@ def analyze_results(stats_df):
            'top_k': int(best_config['top_k']),
            'optimize_vocab': bool(best_config['optimize_vocab']),
            'use_query_optimization': bool(best_config['use_query_optimization']),
-            'use_reranking': bool(best_config['use_reranking'])
+            'use_reranking': bool(best_config['use_reranking']),
+            'lang': best_config['lang'],
+            'apply_preprocessing': bool(best_config['apply_preprocessing']),
+            'apply_phonetic': bool(best_config['apply_phonetic']),
+            'phonetic_weight': float(best_config['phonetic_weight'])
        },
        'performance_summary': {
            'search_time': float(best_config['search_time']),
            'result_diversity': float(best_config['result_diversity']),
            'rank_correlation': float(best_config['rank_correlation']),
-            'silhouette_score': float(best_config['silhouette_score'])
+            'silhouette_score': float(best_config['silhouette_score']),
+            'contains_expected': bool(best_config['contains_expected']),
+            'expected_result_rank': int(best_config['expected_result_rank'])
        }
    }
 
    return recommendations
+
 ####
 
+def get_llm_suggested_settings(file, num_chunks=5):
+    chunks, _, _ = process_files(file.name if file else None, 'HuggingFace', 'paraphrase-miniLM', 'recursive', 500, 50)
+
+    # Select a few random chunks
+    sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
+
+    # Prepare the prompt
+    prompt = f"""Given the following text chunks from a document, suggest optimal settings for an embedding-based search system. The settings should include:
+
+1. Embedding model type and name
+2. Split strategy (token or recursive)
+3. Chunk size
+4. Overlap size
+5. Vector store type (FAISS or Chroma)
+6. Search type (similarity, mmr, or custom)
+7. Top K results to retrieve
+8. Whether to apply preprocessing
+9. Whether to optimize vocabulary
+10. Whether to apply phonetic matching
+
+Text chunks:
+{' '.join(sample_chunks)}
+
+Provide your suggestions in a Python dictionary format."""
+
+    # Use a HuggingFace model for text generation
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="google/flan-t5-large",
+        task="text2text-generation",
+        model_kwargs={"temperature": 0.7, "max_length": 512},
+    )
+
+    # Generate suggestions
+    suggested_settings = llm(prompt)
+
+    # Parse the generated text to extract the dictionary
+    # Note: This assumes the LLM generates a valid Python dictionary. In practice, you might need more robust parsing.
+    try:
+        settings_dict = eval(suggested_settings)
+        return {
+            "embedding_models": f"{settings_dict['embedding_model_type']}:{settings_dict['embedding_model_name']}",
+            "split_strategy": settings_dict["split_strategy"],
+            "chunk_size": settings_dict["chunk_size"],
+            "overlap_size": settings_dict["overlap_size"],
+            "vector_store_type": settings_dict["vector_store_type"],
+            "search_type": settings_dict["search_type"],
+            "top_k": settings_dict["top_k"],
+            "apply_preprocessing": settings_dict["apply_preprocessing"],
+            "optimize_vocab": settings_dict["optimize_vocabulary"],
+            "apply_phonetic": settings_dict["apply_phonetic_matching"],
+            "phonetic_weight": 0.3  # Default value, as it's not in the LLM suggestions
+        }
+    except:
+        return {"error": "Failed to parse LLM suggestions"}
+
+    return settings_dict
+
+
+def update_inputs_with_llm_suggestions(suggestions):
+    if "error" in suggestions:
+        return [gr.update() for _ in range(11)]  # Return no updates if there's an error
+
+    return [
+        gr.update(value=[suggestions["embedding_models"]]),  # embedding_models_input
+        gr.update(value=suggestions["split_strategy"]),      # split_strategy_input
+        gr.update(value=suggestions["chunk_size"]),          # chunk_size_input
+        gr.update(value=suggestions["overlap_size"]),        # overlap_size_input
+        gr.update(value=suggestions["vector_store_type"]),   # vector_store_type_input
+        gr.update(value=suggestions["search_type"]),         # search_type_input
+        gr.update(value=suggestions["top_k"]),               # top_k_input
+        gr.update(value=suggestions["apply_preprocessing"]), # apply_preprocessing_input
+        gr.update(value=suggestions["optimize_vocab"]),      # optimize_vocab_input
+        gr.update(value=suggestions["apply_phonetic"]),      # apply_phonetic_input
+        gr.update(value=suggestions["phonetic_weight"])      # phonetic_weight_input
+    ]
+
+
 # Gradio Interface
 def launch_interface(share=True):
     with gr.Blocks() as iface:
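The committed parser passes the raw LLM output to eval, and its own comment concedes that "you might need more robust parsing". A safer variant, sketched here with ast.literal_eval rather than the committed eval call, returns the same error shape on failure:

import ast

def parse_llm_settings(suggested_settings):
    # literal_eval only accepts Python literals (dicts, lists, strings, numbers),
    # so arbitrary code in the model output cannot be executed.
    try:
        settings_dict = ast.literal_eval(suggested_settings.strip())
        if not isinstance(settings_dict, dict):
            raise ValueError("LLM output is not a dictionary")
        return settings_dict
    except (ValueError, SyntaxError):
        return {"error": "Failed to parse LLM suggestions"}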
@@ -672,7 +759,7 @@ def launch_interface(share=True):
                label="Embedding Models"
            )
            top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
        with gr.Tab("Advanced"):
            custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
            split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -696,10 +783,10 @@ def launch_interface(share=True):
            query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
            use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
 
-        ####
        with gr.Tab("Automation"):
            auto_file_input = gr.File(label="Upload File (Optional)")
            auto_query_input = gr.Textbox(label="Search Query")
+            auto_expected_result_input = gr.Textbox(label="Expected Result (Optional)")
            auto_model_types = gr.CheckboxGroup(
                choices=["HuggingFace", "OpenAI", "Cohere"],
                label="Model Types to Test"
@@ -724,13 +811,35 @@ def launch_interface(share=True):
            auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
            auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
 
+        with gr.Tab("LLM Suggestions"):
+            llm_file_input = gr.File(label="Upload File for LLM Suggestions")
+            llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
+            llm_suggest_button = gr.Button("Get LLM Suggestions")
+            llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
+
+            llm_suggest_button.click(
+                fn=get_llm_suggested_settings,
+                inputs=[llm_file_input, llm_num_chunks],
+                outputs=[llm_suggestions_output]
+            ).then(
+                fn=update_inputs_with_llm_suggestions,
+                inputs=[llm_suggestions_output],
+                outputs=[
+                    embedding_models_input, split_strategy_input, chunk_size_input,
+                    overlap_size_input, vector_store_type_input, search_type_input,
+                    top_k_input, apply_preprocessing_input, optimize_vocab_input,
+                    apply_phonetic_input, phonetic_weight_input
+                ]
+            )
+
        results_output = gr.Dataframe(label="Results", interactive=False)
        stats_output = gr.Dataframe(label="Statistics", interactive=False)
        plot_output = gr.Plot(label="Visualizations")
+        best_settings_output = gr.JSON(label="Best Settings")
 
        submit_button = gr.Button("Compare Embeddings")
        submit_button.click(
-            fn=compare_embeddings,
+            fn=lambda *args: compare_and_show_best(*args),
            inputs=[
                file_input, query_input, embedding_models_input, custom_embedding_model_input,
                split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
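The Compare Embeddings button now calls compare_and_show_best and gains a fourth output, best_settings_output, but that wrapper is not defined anywhere in this diff. Presumably it chains the existing compare_embeddings with analyze_results along these lines (a sketch, not the committed implementation):

def compare_and_show_best(*args):
    # Run the existing comparison, then derive recommendations from its statistics.
    results_df, stats_df, plot = compare_embeddings(*args)
    best_settings = analyze_results(stats_df)
    # Four values to match [results_output, stats_output, plot_output, best_settings_output].
    return results_df, stats_df, plot, best_settings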
@@ -740,7 +849,7 @@ def launch_interface(share=True):
                custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
                use_query_optimization_input, query_optimization_model_input, use_reranking_input
            ],
-            outputs=[results_output, stats_output, plot_output]
+            outputs=[results_output, stats_output, plot_output, best_settings_output]
        )
 
        auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
@@ -751,7 +860,7 @@ def launch_interface(share=True):
        auto_submit_button.click(
            fn=lambda *args: run_automated_tests_and_analyze(*args),
            inputs=[
-                auto_file_input, auto_query_input, auto_model_types, auto_model_names,
+                auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
                auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
                auto_vector_store_types, auto_search_types, auto_top_k,
                auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
 