Update app.py
--- a/app.py
+++ b/app.py
@@ -33,6 +33,10 @@ from functools import lru_cache
 from langchain.retrievers import MultiQueryRetriever
 from langchain.llms import HuggingFacePipeline
 from transformers import pipeline
+from sklearn.model_selection import ParameterGrid
+from tqdm import tqdm
+import random
+
 
 # NLTK Resource Download
 def download_nltk_resources():
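The new imports signal what this commit adds: `ParameterGrid` drives exhaustive sweeps over test settings in the Automation tab, `tqdm` reports progress, and `random` samples chunks for the LLM-suggestion feature further down. A minimal sketch of how such a grid expands, with hypothetical axis values (the keys mirror the `params[...]` lookups later in the diff):

```python
from sklearn.model_selection import ParameterGrid

# Each list is one axis of the sweep; ParameterGrid yields every combination.
test_params = {
    "model_type": ["HuggingFace"],
    "model_name": ["paraphrase-miniLM"],
    "split_strategy": ["token", "recursive"],
    "chunk_size": [250, 500],
    "top_k": [5],
}

for params in ParameterGrid(test_params):
    print(params["split_strategy"], params["chunk_size"])
# 2 split strategies x 2 chunk sizes -> 4 configurations tested
```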
@@ -548,8 +552,8 @@ from tqdm import tqdm
 
 # ... (previous code remains the same)
 
-#
-def automated_testing(file, query, test_params):
+# Function for automated testing
+def automated_testing(file, query, test_params, expected_result=None):
     all_results = []
     all_stats = []
 
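The optional `expected_result` threads through to `calculate_statistics` and feeds the two new metrics weighted in `analyze_results` (`contains_expected` and `expected_result_rank`). A hedged sketch of how those metrics could be derived — this helper is illustrative, assuming each result is a dict with a `content` field, not the app's actual implementation:

```python
def expected_result_metrics(results, expected_result):
    """Illustrative: report whether the expected text appears in the
    retrieved results and at which 1-based rank (0 when absent)."""
    contains_expected = False
    expected_result_rank = 0
    if expected_result:
        for rank, result in enumerate(results, start=1):
            if expected_result in result["content"]:
                contains_expected = True
                expected_result_rank = rank
                break
    return {"contains_expected": contains_expected,
            "expected_result_rank": expected_result_rank}
```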
@@ -587,6 +591,7 @@ def automated_testing(file, query, test_params)
             params['search_type'],
             query,
             params['top_k'],
+            expected_result,
             params['lang'],
             params['apply_phonetic'],
             params['phonetic_weight']
@@ -596,7 +601,7 @@ def automated_testing(file, query, test_params)
             reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
             results_raw = rerank_results(results_raw, query, reranker)
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'])
+        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
         stats["model"] = f"{params['model_type']} - {params['model_name']}"
         stats.update(params)
 
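The cross-encoder is loaded as a plain `text-classification` pipeline, so `rerank_results` presumably scores each (query, passage) pair and re-sorts by relevance. A sketch of that pattern, again assuming results are dicts with a `content` field (the app's real `rerank_results` may differ):

```python
def rerank_results(results, query, reranker):
    # Cross-encoders judge the query and passage jointly; the HF pipeline
    # accepts {"text": ..., "text_pair": ...} inputs and returns a score.
    pairs = [{"text": query, "text_pair": r["content"]} for r in results]
    scores = [s["score"] for s in reranker(pairs)]
    ranked = sorted(zip(results, scores), key=lambda x: x[1], reverse=True)
    return [r for r, _ in ranked]
```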
@@ -605,30 +610,28 @@ def automated_testing(file, query, test_params)
 
     return pd.DataFrame(all_results), pd.DataFrame(all_stats)
 
+
 # Function to analyze results and propose best model and settings
 def analyze_results(stats_df):
-    # Define weights for different metrics (adjust as needed)
     metric_weights = {
-        'search_time': -0.3,
+        'search_time': -0.3,
         'result_diversity': 0.2,
         'rank_correlation': 0.3,
-        'silhouette_score': 0.2
+        'silhouette_score': 0.2,
+        'contains_expected': 0.5,     # High weight for containing the expected result
+        'expected_result_rank': -0.4  # Lower rank (closer to 1) is better
     }
 
-    # Convert relevant columns to numeric type
     for metric in metric_weights.keys():
         stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
 
-    # Calculate weighted score for each configuration
     stats_df['weighted_score'] = sum(
         stats_df[metric].fillna(0) * weight
         for metric, weight in metric_weights.items()
     )
 
-    # Get the best configuration
     best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
 
-    # Generate recommendations
     recommendations = {
         'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
         'best_settings': {
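The score is a plain linear combination: positive weights reward a metric, negative weights penalize it. A toy check of the arithmetic with two hypothetical configurations:

```python
import pandas as pd

stats_df = pd.DataFrame({
    'search_time':          [0.5, 1.0],
    'result_diversity':     [0.6, 0.9],
    'rank_correlation':     [0.7, 0.4],
    'silhouette_score':     [0.3, 0.2],
    'contains_expected':    [1,   0],
    'expected_result_rank': [2,   0],
})
weights = {'search_time': -0.3, 'result_diversity': 0.2,
           'rank_correlation': 0.3, 'silhouette_score': 0.2,
           'contains_expected': 0.5, 'expected_result_rank': -0.4}

score = sum(stats_df[m].fillna(0) * w for m, w in weights.items())
# Row 0: -0.15 + 0.12 + 0.21 + 0.06 + 0.5 - 0.8 = -0.06
# Row 1: -0.30 + 0.18 + 0.12 + 0.04 + 0.0 - 0.0 =  0.04
```

One wrinkle worth noting: when the expected result is absent, both new metrics are 0, so under these weights only a rank-1 hit (+0.5 - 0.4) outscores a miss; a hit at rank 2 or lower (+0.5 - 0.8 or worse) scores below it. The weights may want retuning if deeper hits should still beat misses.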
@@ -640,19 +643,103 @@ def analyze_results(stats_df):
             'top_k': int(best_config['top_k']),
             'optimize_vocab': bool(best_config['optimize_vocab']),
             'use_query_optimization': bool(best_config['use_query_optimization']),
-            'use_reranking': bool(best_config['use_reranking'])
+            'use_reranking': bool(best_config['use_reranking']),
+            'lang': best_config['lang'],
+            'apply_preprocessing': bool(best_config['apply_preprocessing']),
+            'apply_phonetic': bool(best_config['apply_phonetic']),
+            'phonetic_weight': float(best_config['phonetic_weight'])
         },
         'performance_summary': {
             'search_time': float(best_config['search_time']),
             'result_diversity': float(best_config['result_diversity']),
             'rank_correlation': float(best_config['rank_correlation']),
-            'silhouette_score': float(best_config['silhouette_score'])
+            'silhouette_score': float(best_config['silhouette_score']),
+            'contains_expected': bool(best_config['contains_expected']),
+            'expected_result_rank': int(best_config['expected_result_rank'])
         }
     }
 
     return recommendations
+
 ####
 
+def get_llm_suggested_settings(file, num_chunks=5):
+    chunks, _, _ = process_files(file.name if file else None, 'HuggingFace', 'paraphrase-miniLM', 'recursive', 500, 50)
+
+    # Select a few random chunks
+    sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
+
+    # Prepare the prompt
+    prompt = f"""Given the following text chunks from a document, suggest optimal settings for an embedding-based search system. The settings should include:
+
+1. Embedding model type and name
+2. Split strategy (token or recursive)
+3. Chunk size
+4. Overlap size
+5. Vector store type (FAISS or Chroma)
+6. Search type (similarity, mmr, or custom)
+7. Top K results to retrieve
+8. Whether to apply preprocessing
+9. Whether to optimize vocabulary
+10. Whether to apply phonetic matching
+
+Text chunks:
+{' '.join(sample_chunks)}
+
+Provide your suggestions in a Python dictionary format."""
+
+    # Use a HuggingFace model for text generation
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="google/flan-t5-large",
+        task="text2text-generation",
+        model_kwargs={"temperature": 0.7, "max_length": 512},
+    )
+
+    # Generate suggestions
+    suggested_settings = llm(prompt)
+
+    # Parse the generated text to extract the dictionary
+    # Note: this assumes the LLM generates a valid Python dictionary; in practice more robust parsing is needed.
+    try:
+        settings_dict = eval(suggested_settings)
+        return {
+            "embedding_models": f"{settings_dict['embedding_model_type']}:{settings_dict['embedding_model_name']}",
+            "split_strategy": settings_dict["split_strategy"],
+            "chunk_size": settings_dict["chunk_size"],
+            "overlap_size": settings_dict["overlap_size"],
+            "vector_store_type": settings_dict["vector_store_type"],
+            "search_type": settings_dict["search_type"],
+            "top_k": settings_dict["top_k"],
+            "apply_preprocessing": settings_dict["apply_preprocessing"],
+            "optimize_vocab": settings_dict["optimize_vocabulary"],
+            "apply_phonetic": settings_dict["apply_phonetic_matching"],
+            "phonetic_weight": 0.3  # Default value, as it's not in the LLM suggestions
+        }
+    except Exception:
+        return {"error": "Failed to parse LLM suggestions"}
+
+
+def update_inputs_with_llm_suggestions(suggestions):
+    if "error" in suggestions:
+        return [gr.update() for _ in range(11)]  # Return no updates if there's an error
+
+    return [
+        gr.update(value=[suggestions["embedding_models"]]),   # embedding_models_input
+        gr.update(value=suggestions["split_strategy"]),       # split_strategy_input
+        gr.update(value=suggestions["chunk_size"]),           # chunk_size_input
+        gr.update(value=suggestions["overlap_size"]),         # overlap_size_input
+        gr.update(value=suggestions["vector_store_type"]),    # vector_store_type_input
+        gr.update(value=suggestions["search_type"]),          # search_type_input
+        gr.update(value=suggestions["top_k"]),                # top_k_input
+        gr.update(value=suggestions["apply_preprocessing"]),  # apply_preprocessing_input
+        gr.update(value=suggestions["optimize_vocab"]),       # optimize_vocab_input
+        gr.update(value=suggestions["apply_phonetic"]),       # apply_phonetic_input
+        gr.update(value=suggestions["phonetic_weight"])       # phonetic_weight_input
+    ]
+
+
 # Gradio Interface
 def launch_interface(share=True):
     with gr.Blocks() as iface:
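Passing raw model output to `eval` is the fragile (and unsafe) step the inline note concedes. A safer sketch extracts the first `{...}` span and parses it with `ast.literal_eval`, which accepts only Python literals; the helper name is illustrative:

```python
import ast
import re

def parse_llm_settings(generated_text):
    """Illustrative: pull the first dict literal out of LLM output without eval."""
    match = re.search(r"\{.*\}", generated_text, re.DOTALL)
    if not match:
        return {"error": "No dictionary found in LLM output"}
    try:
        settings = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError):
        return {"error": "Failed to parse LLM suggestions"}
    return settings if isinstance(settings, dict) else {"error": "Not a dictionary"}
```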
@@ -672,7 +759,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
             custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
             split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -696,10 +783,10 @@ def launch_interface(share=True):
             query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
             use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
 
-        ####
         with gr.Tab("Automation"):
             auto_file_input = gr.File(label="Upload File (Optional)")
             auto_query_input = gr.Textbox(label="Search Query")
+            auto_expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             auto_model_types = gr.CheckboxGroup(
                 choices=["HuggingFace", "OpenAI", "Cohere"],
                 label="Model Types to Test"
@@ -724,13 +811,35 @@ def launch_interface(share=True):
             auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
             auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
 
+        with gr.Tab("LLM Suggestions"):
+            llm_file_input = gr.File(label="Upload File for LLM Suggestions")
+            llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
+            llm_suggest_button = gr.Button("Get LLM Suggestions")
+            llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
+
+            llm_suggest_button.click(
+                fn=get_llm_suggested_settings,
+                inputs=[llm_file_input, llm_num_chunks],
+                outputs=[llm_suggestions_output]
+            ).then(
+                fn=update_inputs_with_llm_suggestions,
+                inputs=[llm_suggestions_output],
+                outputs=[
+                    embedding_models_input, split_strategy_input, chunk_size_input,
+                    overlap_size_input, vector_store_type_input, search_type_input,
+                    top_k_input, apply_preprocessing_input, optimize_vocab_input,
+                    apply_phonetic_input, phonetic_weight_input
+                ]
+            )
+
         results_output = gr.Dataframe(label="Results", interactive=False)
         stats_output = gr.Dataframe(label="Statistics", interactive=False)
         plot_output = gr.Plot(label="Visualizations")
+        best_settings_output = gr.JSON(label="Best Settings")
 
         submit_button = gr.Button("Compare Embeddings")
         submit_button.click(
-            fn=
+            fn=lambda *args: compare_and_show_best(*args),
             inputs=[
                 file_input, query_input, embedding_models_input, custom_embedding_model_input,
                 split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
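Gradio event bindings return an event object, so the `.then()` chain above runs `update_inputs_with_llm_suggestions` only after `get_llm_suggested_settings` finishes, letting one click both display the JSON and repopulate the main-tab inputs. A stripped-down sketch of the same pattern:

```python
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Suggest")
    raw = gr.JSON()
    box = gr.Textbox()

    # The second callback fires only after the first has produced its output.
    btn.click(fn=lambda: {"value": "hello"}, inputs=None, outputs=raw).then(
        fn=lambda s: gr.update(value=s["value"]), inputs=raw, outputs=box
    )
```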
@@ -740,7 +849,7 @@ def launch_interface(share=True):
                 custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
                 use_query_optimization_input, query_optimization_model_input, use_reranking_input
             ],
-            outputs=[results_output, stats_output, plot_output]
+            outputs=[results_output, stats_output, plot_output, best_settings_output]
         )
 
         auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
@@ -751,7 +860,7 @@ def launch_interface(share=True):
         auto_submit_button.click(
             fn=lambda *args: run_automated_tests_and_analyze(*args),
             inputs=[
-                auto_file_input, auto_query_input, auto_model_types, auto_model_names,
+                auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
                 auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
                 auto_vector_store_types, auto_search_types, auto_top_k,
                 auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking