More_Advanced_Embeddings_Comparator

Running

App Files Files Community

Chris4K committed on Oct 20

Commit

6fd2acf

•

1 Parent(s): 60de941

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -0

app.py CHANGED Viewed

@@ -531,6 +531,111 @@ def format_results(results, stats):
         formatted_results.append(result)
     return formatted_results
 # Gradio Interface
 def launch_interface(share=True):
     with gr.Blocks() as iface:
@@ -592,6 +697,51 @@ def launch_interface(share=True):
             outputs=[results_output, stats_output, plot_output]
         )
     tutorial_md = """
     # Advanced Embedding Comparison Tool Tutorial
@@ -618,5 +768,33 @@ def launch_interface(share=True):
     iface.launch(share=share)
 if __name__ == "__main__":
     launch_interface()

         formatted_results.append(result)
     return formatted_results
+#####
+from sklearn.model_selection import ParameterGrid
+from tqdm import tqdm
+# ... (previous code remains the same)
+# New function for automated testing
def automated_testing(file, query, test_params):
    """Run the processing/search pipeline once per parameter combination.

    Args:
        file: Uploaded file object exposing a ``name`` attribute, or a falsy
            value when no file was uploaded.
        query: Search query used as the starting point for every combination.
        test_params: Mapping of parameter name to the list of values to try;
            it is expanded into individual combinations with ``ParameterGrid``.

    Returns:
        A ``(results_df, stats_df)`` tuple of DataFrames: the formatted
        results of all runs, and one statistics row per combination (the
        statistics are merged with the parameters that produced them).
    """
    all_results = []
    all_stats = []
    file_path = file.name if file else None
    # Created on first use and then shared, so the cross-encoder is not
    # reloaded for every parameter combination.
    reranker = None

    for params in tqdm(ParameterGrid(test_params), desc="Running tests"):
        chunks, embedding_model, num_tokens = process_files(
            file_path,
            params['model_type'],
            params['model_name'],
            params['split_strategy'],
            params['chunk_size'],
            params['overlap_size'],
            params.get('custom_separators', None),
            params['lang'],
            params['apply_preprocessing'],
            params.get('custom_tokenizer_file', None),
            params.get('custom_tokenizer_model', None),
            params.get('custom_tokenizer_vocab_size', 10000),
            params.get('custom_tokenizer_special_tokens', None)
        )

        if params['optimize_vocab']:
            # Only the rewritten chunks are needed; the tokenizer is discarded.
            _, chunks = optimize_vocabulary(chunks)

        # Every combination starts from the caller's original query. Rebinding
        # ``query`` itself would feed an already-optimized query into the next
        # iteration and compound the rewriting across runs.
        run_query = query
        if params['use_query_optimization']:
            optimized_queries = optimize_query(query, params['query_optimization_model'])
            run_query = " ".join(optimized_queries)

        _, search_time, vector_store, results_raw = search_embeddings(
            chunks,
            embedding_model,
            params['vector_store_type'],
            params['search_type'],
            run_query,
            params['top_k'],
            params['lang'],
            params['apply_phonetic'],
            params['phonetic_weight']
        )

        if params['use_reranking']:
            if reranker is None:
                reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
            results_raw = rerank_results(results_raw, run_query, reranker)

        stats = calculate_statistics(
            results_raw, search_time, vector_store, num_tokens,
            embedding_model, run_query, params['top_k']
        )
        stats["model"] = f"{params['model_type']} - {params['model_name']}"
        # Record the parameters next to the metrics so each row is self-describing.
        stats.update(params)

        all_results.extend(format_results(results_raw, stats))
        all_stats.append(stats)

    return pd.DataFrame(all_results), pd.DataFrame(all_stats)
+# Function to analyze results and propose best model and settings
def _to_native(value):
    """Return ``value`` as a built-in Python scalar if it is a NumPy scalar."""
    item = getattr(value, "item", None)
    return item() if callable(item) else value


def analyze_results(stats_df):
    """Score every tested configuration and describe the best one.

    A ``weighted_score`` column is added to ``stats_df`` in place. The score
    is a weighted sum of the raw metric columns; the metrics are not
    normalized, so their relative scale affects the ranking.

    Args:
        stats_df: DataFrame with one row per configuration. It must contain
            the metric columns ``search_time``, ``result_diversity``,
            ``rank_correlation`` and ``silhouette_score`` together with the
            parameter columns reported in the recommendation.

    Returns:
        A dict with ``best_model``, ``best_settings`` and
        ``performance_summary`` holding plain Python values, or an empty dict
        when ``stats_df`` has no rows or no row has a computable score.

    Raises:
        KeyError: If a required metric or parameter column is missing.
    """
    if stats_df.empty:
        return {}

    # Define weights for different metrics (adjust as needed)
    metric_weights = {
        'search_time': -0.3,  # Lower is better
        'result_diversity': 0.2,
        'rank_correlation': 0.3,
        'silhouette_score': 0.2
    }

    # Calculate weighted score for each configuration
    scores = sum(stats_df[metric] * weight for metric, weight in metric_weights.items())
    stats_df['weighted_score'] = scores

    # A NaN metric makes the row's score NaN; idxmax skips such rows, but it
    # cannot pick a winner when every score is NaN.
    if not scores.notna().any():
        return {}

    # Get the best configuration
    best_config = stats_df.loc[scores.idxmax()]

    setting_keys = (
        'split_strategy', 'chunk_size', 'overlap_size', 'vector_store_type',
        'search_type', 'top_k', 'optimize_vocab', 'use_query_optimization',
        'use_reranking',
    )

    # Values are converted to built-in types so the result is plain data that
    # any JSON encoder can serialize.
    return {
        'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
        'best_settings': {key: _to_native(best_config[key]) for key in setting_keys},
        'performance_summary': {
            metric: _to_native(best_config[metric]) for metric in metric_weights
        },
    }
+####
 # Gradio Interface
 def launch_interface(share=True):
     with gr.Blocks() as iface:
             outputs=[results_output, stats_output, plot_output]
         )
+        ####
+                with gr.Tab("Automated"):
+            auto_file_input = gr.File(label="Upload File (Optional)")
+            auto_query_input = gr.Textbox(label="Search Query")
+            auto_model_types = gr.CheckboxGroup(
+                choices=["HuggingFace", "OpenAI", "Cohere"],
+                label="Model Types to Test"
+            )
+            auto_model_names = gr.TextArea(label="Model Names to Test (comma-separated)")
+            auto_split_strategies = gr.CheckboxGroup(
+                choices=["token", "recursive"],
+                label="Split Strategies to Test"
+            )
+            auto_chunk_sizes = gr.TextArea(label="Chunk Sizes to Test (comma-separated)")
+            auto_overlap_sizes = gr.TextArea(label="Overlap Sizes to Test (comma-separated)")
+            auto_vector_store_types = gr.CheckboxGroup(
+                choices=["FAISS", "Chroma"],
+                label="Vector Store Types to Test"
+            )
+            auto_search_types = gr.CheckboxGroup(
+                choices=["similarity", "mmr", "custom"],
+                label="Search Types to Test"
+            )
+            auto_top_k = gr.TextArea(label="Top K Values to Test (comma-separated)")
+            auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
+            auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
+            auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
+        auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
+        auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
+        recommendations_output = gr.JSON(label="Recommendations")
+        auto_submit_button = gr.Button("Run Automated Tests")
+        auto_submit_button.click(
+            fn=lambda *args: run_automated_tests_and_analyze(*args),
+            inputs=[
+                auto_file_input, auto_query_input, auto_model_types, auto_model_names,
+                auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
+                auto_vector_store_types, auto_search_types, auto_top_k,
+                auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
+            ],
+            outputs=[auto_results_output, auto_stats_output, recommendations_output]
+        )
+        ###
     tutorial_md = """
     # Advanced Embedding Comparison Tool Tutorial
     iface.launch(share=share)
def _split_csv(text):
    """Split a comma-separated string into stripped, non-empty items."""
    return [item.strip() for item in (text or "").split(',') if item.strip()]


def _parse_int_list(text, label):
    """Parse a comma-separated string of integers, naming ``label`` on failure."""
    try:
        return [int(item) for item in _split_csv(text)]
    except ValueError as err:
        raise ValueError(
            f"{label} must be a comma-separated list of integers, got {text!r}"
        ) from err


def run_automated_tests_and_analyze(*args):
    """Build the parameter grid from the UI inputs, run it and analyze it.

    Args:
        *args: Thirteen positional values, in this order: file, query,
            model types, model names (comma-separated string), split
            strategies, chunk sizes, overlap sizes, vector store types,
            search types, top-k values (the three size/top-k fields are
            comma-separated strings of integers), and the optimize-vocab,
            query-optimization and reranking flags.

    Returns:
        A ``(results_df, stats_df, recommendations)`` tuple as produced by
        ``automated_testing`` and ``analyze_results``.

    Raises:
        ValueError: If a numeric field contains a non-integer entry, or if
            any parameter ends up with no values to test (which would yield
            an empty grid).
    """
    (file, query, model_types, model_names, split_strategies, chunk_sizes,
     overlap_sizes, vector_store_types, search_types, top_k_values,
     optimize_vocab, use_query_optimization, use_reranking) = args

    test_params = {
        'model_type': list(model_types or []),
        'model_name': _split_csv(model_names),
        'split_strategy': list(split_strategies or []),
        'chunk_size': _parse_int_list(chunk_sizes, "Chunk sizes"),
        'overlap_size': _parse_int_list(overlap_sizes, "Overlap sizes"),
        'vector_store_type': list(vector_store_types or []),
        'search_type': list(search_types or []),
        'top_k': _parse_int_list(top_k_values, "Top K values"),
        'lang': ['german'],  # You can expand this if needed
        'apply_preprocessing': [True],
        'optimize_vocab': [optimize_vocab],
        'apply_phonetic': [True],
        'phonetic_weight': [0.3],
        'use_query_optimization': [use_query_optimization],
        'query_optimization_model': ['google/flan-t5-base'],
        'use_reranking': [use_reranking]
    }

    # One empty value list empties the whole grid, so fail early with a
    # message that names the offending fields.
    missing = [name for name, values in test_params.items() if not values]
    if missing:
        raise ValueError("No values provided for: " + ", ".join(missing))

    results_df, stats_df = automated_testing(file, query, test_params)
    recommendations = analyze_results(stats_df)
    return results_df, stats_df, recommendations
 if __name__ == "__main__":
     launch_interface()