Update app.py
--- a/app.py
+++ b/app.py
@@ -33,6 +33,10 @@ from functools import lru_cache
 from langchain.retrievers import MultiQueryRetriever
 from langchain.llms import HuggingFacePipeline
 from transformers import pipeline
+from sklearn.model_selection import ParameterGrid
+from tqdm import tqdm
+import random
+
 
 # NLTK Resource Download
 def download_nltk_resources():
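The new imports signal what this commit adds: `ParameterGrid` drives exhaustive sweeps over test settings in the Automation tab, `tqdm` reports progress, and `random` samples chunks for the LLM-suggestion feature further down. A minimal sketch of how such a grid expands, with hypothetical axis values (the keys mirror the `params[...]` lookups later in the diff):

```python
from sklearn.model_selection import ParameterGrid

# Each list is one axis of the sweep; ParameterGrid yields every combination.
test_params = {
    "model_type": ["HuggingFace"],
    "model_name": ["paraphrase-miniLM"],
    "split_strategy": ["token", "recursive"],
    "chunk_size": [250, 500],
    "top_k": [5],
}

for params in ParameterGrid(test_params):
    print(params["split_strategy"], params["chunk_size"])
# 2 split strategies x 2 chunk sizes -> 4 configurations tested
```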
@@ -548,8 +552,8 @@ from tqdm import tqdm
 
 # ... (previous code remains the same)
 
-#
-def automated_testing(file, query, test_params):
+# Function for automated testing
+def automated_testing(file, query, test_params, expected_result=None):
     all_results = []
     all_stats = []
 
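The optional `expected_result` threads through to `calculate_statistics` and feeds the two new metrics weighted in `analyze_results` (`contains_expected` and `expected_result_rank`). A hedged sketch of how those metrics could be derived — this helper is illustrative, assuming each result is a dict with a `content` field, not the app's actual implementation:

```python
def expected_result_metrics(results, expected_result):
    """Illustrative: report whether the expected text appears in the
    retrieved results and at which 1-based rank (0 when absent)."""
    contains_expected = False
    expected_result_rank = 0
    if expected_result:
        for rank, result in enumerate(results, start=1):
            if expected_result in result["content"]:
                contains_expected = True
                expected_result_rank = rank
                break
    return {"contains_expected": contains_expected,
            "expected_result_rank": expected_result_rank}
```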
@@ -587,6 +591,7 @@ def automated_testing(file, query, test_params)
             params['search_type'],
             query,
             params['top_k'],
+            expected_result,
             params['lang'],
             params['apply_phonetic'],
             params['phonetic_weight']
@@ -596,7 +601,7 @@ def automated_testing(file, query, test_params)
             reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
             results_raw = rerank_results(results_raw, query, reranker)
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'])
+        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
         stats["model"] = f"{params['model_type']} - {params['model_name']}"
         stats.update(params)
 
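The cross-encoder is loaded as a plain `text-classification` pipeline, so `rerank_results` presumably scores each (query, passage) pair and re-sorts by relevance. A sketch of that pattern, again assuming results are dicts with a `content` field (the app's real `rerank_results` may differ):

```python
def rerank_results(results, query, reranker):
    # Cross-encoders judge the query and passage jointly; the HF pipeline
    # accepts {"text": ..., "text_pair": ...} inputs and returns a score.
    pairs = [{"text": query, "text_pair": r["content"]} for r in results]
    scores = [s["score"] for s in reranker(pairs)]
    ranked = sorted(zip(results, scores), key=lambda x: x[1], reverse=True)
    return [r for r, _ in ranked]
```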
@@ -605,30 +610,28 @@ def automated_testing(file, query, test_params)
 
     return pd.DataFrame(all_results), pd.DataFrame(all_stats)
 
+
 # Function to analyze results and propose best model and settings
 def analyze_results(stats_df):
-    # Define weights for different metrics (adjust as needed)
     metric_weights = {
-        'search_time': -0.3,
+        'search_time': -0.3,
         'result_diversity': 0.2,
         'rank_correlation': 0.3,
-        'silhouette_score': 0.2
+        'silhouette_score': 0.2,
+        'contains_expected': 0.5,     # High weight for containing the expected result
+        'expected_result_rank': -0.4  # Lower rank (closer to 1) is better
     }
 
-    # Convert relevant columns to numeric type
     for metric in metric_weights.keys():
         stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
 
-    # Calculate weighted score for each configuration
     stats_df['weighted_score'] = sum(
         stats_df[metric].fillna(0) * weight
         for metric, weight in metric_weights.items()
     )
 
-    # Get the best configuration
     best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
 
-    # Generate recommendations
     recommendations = {
         'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
         'best_settings': {
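The score is a plain linear combination: positive weights reward a metric, negative weights penalize it. A toy check of the arithmetic with two hypothetical configurations:

```python
import pandas as pd

stats_df = pd.DataFrame({
    'search_time':          [0.5, 1.0],
    'result_diversity':     [0.6, 0.9],
    'rank_correlation':     [0.7, 0.4],
    'silhouette_score':     [0.3, 0.2],
    'contains_expected':    [1,   0],
    'expected_result_rank': [2,   0],
})
weights = {'search_time': -0.3, 'result_diversity': 0.2,
           'rank_correlation': 0.3, 'silhouette_score': 0.2,
           'contains_expected': 0.5, 'expected_result_rank': -0.4}

score = sum(stats_df[m].fillna(0) * w for m, w in weights.items())
# Row 0: -0.15 + 0.12 + 0.21 + 0.06 + 0.5 - 0.8 = -0.06
# Row 1: -0.30 + 0.18 + 0.12 + 0.04 + 0.0 - 0.0 =  0.04
```

One wrinkle worth noting: when the expected result is absent, both new metrics are 0, so under these weights only a rank-1 hit (+0.5 - 0.4) outscores a miss; a hit at rank 2 or lower (+0.5 - 0.8 or worse) scores below it. The weights may want retuning if deeper hits should still beat misses.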
@@ -640,19 +643,103 @@ def analyze_results(stats_df):
             'top_k': int(best_config['top_k']),
             'optimize_vocab': bool(best_config['optimize_vocab']),
             'use_query_optimization': bool(best_config['use_query_optimization']),
-            'use_reranking': bool(best_config['use_reranking'])
+            'use_reranking': bool(best_config['use_reranking']),
+            'lang': best_config['lang'],
+            'apply_preprocessing': bool(best_config['apply_preprocessing']),
+            'apply_phonetic': bool(best_config['apply_phonetic']),
+            'phonetic_weight': float(best_config['phonetic_weight'])
         },
         'performance_summary': {
             'search_time': float(best_config['search_time']),
             'result_diversity': float(best_config['result_diversity']),
             'rank_correlation': float(best_config['rank_correlation']),
-            'silhouette_score': float(best_config['silhouette_score'])
+            'silhouette_score': float(best_config['silhouette_score']),
+            'contains_expected': bool(best_config['contains_expected']),
+            'expected_result_rank': int(best_config['expected_result_rank'])
         }
     }
 
     return recommendations
+
 ####
 
+def get_llm_suggested_settings(file, num_chunks=5):
+    chunks, _, _ = process_files(file.name if file else None, 'HuggingFace', 'paraphrase-miniLM', 'recursive', 500, 50)
+
+    # Select a few random chunks
+    sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
+
+    # Prepare the prompt
+    prompt = f"""Given the following text chunks from a document, suggest optimal settings for an embedding-based search system. The settings should include:
+
+1. Embedding model type and name
+2. Split strategy (token or recursive)
+3. Chunk size
+4. Overlap size
+5. Vector store type (FAISS or Chroma)
+6. Search type (similarity, mmr, or custom)
+7. Top K results to retrieve
+8. Whether to apply preprocessing
+9. Whether to optimize vocabulary
+10. Whether to apply phonetic matching
+
+Text chunks:
+{' '.join(sample_chunks)}
+
+Provide your suggestions in a Python dictionary format."""
+
+    # Use a HuggingFace model for text generation
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="google/flan-t5-large",
+        task="text2text-generation",
+        model_kwargs={"temperature": 0.7, "max_length": 512},
+    )
+
+    # Generate suggestions
+    suggested_settings = llm(prompt)
+
+    # Parse the generated text to extract the dictionary
+    # Note: this assumes the LLM generates a valid Python dictionary; in practice more robust parsing is needed.
+    try:
+        settings_dict = eval(suggested_settings)
+        return {
+            "embedding_models": f"{settings_dict['embedding_model_type']}:{settings_dict['embedding_model_name']}",
+            "split_strategy": settings_dict["split_strategy"],
+            "chunk_size": settings_dict["chunk_size"],
+            "overlap_size": settings_dict["overlap_size"],
+            "vector_store_type": settings_dict["vector_store_type"],
+            "search_type": settings_dict["search_type"],
+            "top_k": settings_dict["top_k"],
+            "apply_preprocessing": settings_dict["apply_preprocessing"],
+            "optimize_vocab": settings_dict["optimize_vocabulary"],
+            "apply_phonetic": settings_dict["apply_phonetic_matching"],
+            "phonetic_weight": 0.3  # Default value, as it's not in the LLM suggestions
+        }
+    except Exception:
+        return {"error": "Failed to parse LLM suggestions"}
+
+
+def update_inputs_with_llm_suggestions(suggestions):
+    if "error" in suggestions:
+        return [gr.update() for _ in range(11)]  # Return no updates if there's an error
+
+    return [
+        gr.update(value=[suggestions["embedding_models"]]),   # embedding_models_input
+        gr.update(value=suggestions["split_strategy"]),       # split_strategy_input
+        gr.update(value=suggestions["chunk_size"]),           # chunk_size_input
+        gr.update(value=suggestions["overlap_size"]),         # overlap_size_input
+        gr.update(value=suggestions["vector_store_type"]),    # vector_store_type_input
+        gr.update(value=suggestions["search_type"]),          # search_type_input
+        gr.update(value=suggestions["top_k"]),                # top_k_input
+        gr.update(value=suggestions["apply_preprocessing"]),  # apply_preprocessing_input
+        gr.update(value=suggestions["optimize_vocab"]),       # optimize_vocab_input
+        gr.update(value=suggestions["apply_phonetic"]),       # apply_phonetic_input
+        gr.update(value=suggestions["phonetic_weight"])       # phonetic_weight_input
+    ]
+
+
 # Gradio Interface
 def launch_interface(share=True):
     with gr.Blocks() as iface:
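Passing raw model output to `eval` is the fragile (and unsafe) step the inline note concedes. A safer sketch extracts the first `{...}` span and parses it with `ast.literal_eval`, which accepts only Python literals; the helper name is illustrative:

```python
import ast
import re

def parse_llm_settings(generated_text):
    """Illustrative: pull the first dict literal out of LLM output without eval."""
    match = re.search(r"\{.*\}", generated_text, re.DOTALL)
    if not match:
        return {"error": "No dictionary found in LLM output"}
    try:
        settings = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError):
        return {"error": "Failed to parse LLM suggestions"}
    return settings if isinstance(settings, dict) else {"error": "Not a dictionary"}
```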
@@ -672,7 +759,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
             custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
             split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -696,10 +783,10 @@ def launch_interface(share=True):
             query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
             use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
 
-        ####
         with gr.Tab("Automation"):
             auto_file_input = gr.File(label="Upload File (Optional)")
             auto_query_input = gr.Textbox(label="Search Query")
+            auto_expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             auto_model_types = gr.CheckboxGroup(
                 choices=["HuggingFace", "OpenAI", "Cohere"],
                 label="Model Types to Test"
@@ -724,13 +811,35 @@ def launch_interface(share=True):
             auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
             auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
 
+        with gr.Tab("LLM Suggestions"):
+            llm_file_input = gr.File(label="Upload File for LLM Suggestions")
+            llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
+            llm_suggest_button = gr.Button("Get LLM Suggestions")
+            llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
+
+            llm_suggest_button.click(
+                fn=get_llm_suggested_settings,
+                inputs=[llm_file_input, llm_num_chunks],
+                outputs=[llm_suggestions_output]
+            ).then(
+                fn=update_inputs_with_llm_suggestions,
+                inputs=[llm_suggestions_output],
+                outputs=[
+                    embedding_models_input, split_strategy_input, chunk_size_input,
+                    overlap_size_input, vector_store_type_input, search_type_input,
+                    top_k_input, apply_preprocessing_input, optimize_vocab_input,
+                    apply_phonetic_input, phonetic_weight_input
+                ]
+            )
+
         results_output = gr.Dataframe(label="Results", interactive=False)
         stats_output = gr.Dataframe(label="Statistics", interactive=False)
         plot_output = gr.Plot(label="Visualizations")
+        best_settings_output = gr.JSON(label="Best Settings")
 
         submit_button = gr.Button("Compare Embeddings")
         submit_button.click(
-            fn=
+            fn=lambda *args: compare_and_show_best(*args),
             inputs=[
                 file_input, query_input, embedding_models_input, custom_embedding_model_input,
                 split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
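Gradio event bindings return an event object, so the `.then()` chain above runs `update_inputs_with_llm_suggestions` only after `get_llm_suggested_settings` finishes, letting one click both display the JSON and repopulate the main-tab inputs. A stripped-down sketch of the same pattern:

```python
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Suggest")
    raw = gr.JSON()
    box = gr.Textbox()

    # The second callback fires only after the first has produced its output.
    btn.click(fn=lambda: {"value": "hello"}, inputs=None, outputs=raw).then(
        fn=lambda s: gr.update(value=s["value"]), inputs=raw, outputs=box
    )
```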
@@ -740,7 +849,7 @@ def launch_interface(share=True):
                 custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
                 use_query_optimization_input, query_optimization_model_input, use_reranking_input
             ],
-            outputs=[results_output, stats_output, plot_output]
+            outputs=[results_output, stats_output, plot_output, best_settings_output]
         )
 
         auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
@@ -751,7 +860,7 @@ def launch_interface(share=True):
         auto_submit_button.click(
             fn=lambda *args: run_automated_tests_and_analyze(*args),
             inputs=[
-                auto_file_input, auto_query_input, auto_model_types, auto_model_names,
+                auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
                 auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
                 auto_vector_store_types, auto_search_types, auto_top_k,
                 auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking