Chris4K committed
Commit 77d7782
1 Parent(s): 7157008

Update app.py

Files changed (1):
  1. app.py +297 -117

app.py CHANGED
@@ -74,6 +74,8 @@ FILES_DIR = './files'
 # Model Management
 class ModelManager:
     def __init__(self):
+        self.rankings: Dict[str, float] = {}
+        self.model_stats: Dict[str, Dict[str, Any]] = {}
         self.models = {
             'HuggingFace': {
                 'e5-base-de': "danielheinz/e5-base-sts-en-de",
@@ -90,6 +92,28 @@ class ModelManager:
             }
         }
 
+
+    def update_model_ranking(self, model_id: str, score: float, feedback: Optional[str] = None):
+        """Update model ranking based on performance and optional feedback"""
+        current_score = self.rankings.get(model_id, 0.0)
+        # Weighted average of current score and new score
+        self.rankings[model_id] = 0.7 * current_score + 0.3 * score
+
+        if feedback:
+            if model_id not in self.model_stats:
+                self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
+            self.model_stats[model_id]["feedback_count"] += 1
+            self.model_stats[model_id]["feedback"].append(feedback)
+
+    def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
+        """Get top n ranked models"""
+        return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
+
+    def get_model_stats(self, model_id: str) -> Dict[str, Any]:
+        """Get statistics for a specific model"""
+        return self.model_stats.get(model_id, {})
+
+
     def add_model(self, provider, name, model_path):
         if provider not in self.models:
             self.models[provider] = {}
@@ -197,7 +221,7 @@ class FileHandler:
 def simple_tokenize(text):
     return text.split()
 
-def preprocess_text(text, lang='german', apply_preprocessing=True):
+def preprocess_text(text, lang='german', apply_preprocessing=False):
     if not apply_preprocessing:
         return text
 
@@ -225,7 +249,7 @@ def preprocess_text(text, lang='german', apply_preprocessing=True):
 
     return ' '.join(tokens)
 
-def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True):
+def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
    if not apply_phonetic:
        return 0
    if method == 'levenshtein_distance':
@@ -390,7 +414,7 @@ def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
 
 
 # Main Processing Functions
-def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', apply_preprocessing=True, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
+def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', apply_preprocessing=False, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:
@@ -412,7 +436,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
 
     return chunks, embedding_model, len(text.split())
 
-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
 
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
@@ -421,6 +445,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
     start_time = time.time()
     results = retriever.invoke(preprocessed_query)
 
+    #this should be optional
     def score_result(doc):
         base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
 
@@ -452,68 +477,83 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 
     return results_df, end_time - start_time, vector_store, results
 
-
-
-# Evaluation Metrics
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result=None):
-    stats = {
-        "num_results": len(results),
-        "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
-        "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
-        "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
-        "search_time": search_time,
-        "num_tokens": num_tokens,
-        "embedding_dimension": len(embedding_model.embed_query(query)),
-        "top_k": top_k,
-    }
-
-    # Safely get vector store size
-    try:
-        if hasattr(vector_store, '_index'):
-            stats["vector_store_size"] = vector_store._index.ntotal
-        elif hasattr(vector_store, '_collection'):
-            stats["vector_store_size"] = len(vector_store._collection.get())
-        else:
-            stats["vector_store_size"] = "N/A"
-    except:
-        stats["vector_store_size"] = "N/A"
-
-    # Safely get document count
-    try:
-        if hasattr(vector_store, 'docstore'):
-            stats["num_documents"] = len(vector_store.docstore._dict)
-        elif hasattr(vector_store, '_collection'):
-            stats["num_documents"] = len(vector_store._collection.get())
-        else:
-            stats["num_documents"] = len(results)
-    except:
-        stats["num_documents"] = len(results)
-
-
-    if expected_result:
-        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
-        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
-
-    if len(results) > 1000:
-        embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
-        pairwise_similarities = np.inner(embeddings, embeddings)
-        stats["result_diversity"] = 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
-
-        if len(embeddings) > 2:
-            stats["silhouette_score"] = silhouette_score(embeddings, range(len(embeddings)))
+# Enhanced Result Analysis
+class ResultAnalyzer:
+    @staticmethod
+    def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
+                             top_k, expected_result=None, model_feedback=None):
+        stats = {
+            "num_results": len(results),
+            "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+            "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+            "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
+            "search_time": search_time,
+            "num_tokens": num_tokens,
+            "embedding_dimension": len(embedding_model.embed_query(query)),
+            "top_k": top_k,
+        }
+
+        # Add vector store statistics
+        try:
+            if hasattr(vector_store, '_index'):
+                stats["vector_store_size"] = vector_store._index.ntotal
+            elif hasattr(vector_store, '_collection'):
+                stats["vector_store_size"] = len(vector_store._collection.get())
+        except:
+            stats["vector_store_size"] = "N/A"
+
+        # Add expected result statistics if provided
+        if expected_result:
+            stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+            stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
+                                                  if expected_result in doc.page_content), -1) + 1
+
+        # Calculate diversity metrics for larger result sets
+        if len(results) > 3:  # Changed from 1000 to make it more practical
+            embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+            stats["result_diversity"] = ResultAnalyzer._calculate_diversity(embeddings)
+            stats["silhouette_score"] = ResultAnalyzer._calculate_silhouette(embeddings)
         else:
+            stats["result_diversity"] = "N/A"
             stats["silhouette_score"] = "N/A"
-    else:
-        stats["result_diversity"] = "N/A"
-        stats["silhouette_score"] = "N/A"
-
-    query_embedding = embedding_model.embed_query(query)
-    result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
-    similarities = [np.inner(query_embedding, emb) for emb in result_embeddings]
-    rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
-    stats["rank_correlation"] = rank_correlation
+
+        # Add ranking correlation
+        query_embedding = embedding_model.embed_query(query)
+        result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
+        similarities = [np.inner(query_embedding, emb) for emb in result_embeddings]
+        if len(similarities) > 1:
+            rank_correlation, _ = spearmanr(similarities, range(len(similarities)))
+            stats["rank_correlation"] = rank_correlation
+        else:
+            stats["rank_correlation"] = "N/A"
+
+        # Add model feedback if provided
+        if model_feedback:
+            stats["model_feedback"] = model_feedback
+
+        return stats
 
-    return stats
+    @staticmethod
+    def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
+        """Calculate diversity score for embeddings"""
+        embeddings_array = np.array(embeddings)
+        pairwise_similarities = np.inner(embeddings_array, embeddings_array)
+        return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
+
+    @staticmethod
+    def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
+        """Calculate silhouette score for embeddings"""
+        if len(embeddings) < 3:
+            return 0.0
+        try:
+            return silhouette_score(embeddings, range(len(embeddings)))
+        except:
+            return 0.0
+
+
+
+
 # Visualization
 def visualize_results(results_df, stats_df):
     # Add model column if not present
@@ -688,7 +728,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
         result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
+        stats = ResultAnalyzer.calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
         stats["model"] = f"{model_type} - {model_name}"
         stats["model_type"] = model_type
         stats["model_name"] = model_name
@@ -783,7 +823,7 @@ def automated_testing(file, query, test_params, expected_result=None):
             reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
             results_raw = rerank_results(results_raw, query, reranker)
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
+        stats = ResultAnalyzer.calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, params['top_k'], expected_result)
         stats["model"] = f"{params['model_type']} - {params['model_name']}"
         stats["model_type"] = params['model_type']
        stats["model_name"] = params['model_name']
@@ -989,28 +1029,55 @@ def launch_interface(share=True):
             search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
             lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
 
-        with gr.Tab("Optional"):
-            apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=True)
+        with gr.Tab("Expert"):
+            apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
             optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
-            apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=True)
+            apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=False)
             phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
             custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
             custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
             custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
             custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
             use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
-            query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
+            query_optimization_model_input = gr.Textbox(label="Query Optimization Model (google/flan-t5-base) ", value="")
             use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
 
         with gr.Tab("Automation"):
-            auto_file_input = gr.File(label="Upload File (Optional)")
-            auto_query_input = gr.Textbox(label="Search Query")
-            auto_expected_result_input = gr.Textbox(label="Expected Result (Optional)")
-            auto_model_types = gr.CheckboxGroup(
-                choices=["HuggingFace", "OpenAI", "Cohere"],
-                label="Model Types to Test"
-            )
-            auto_model_names = gr.TextArea(label="Model Names to Test (comma-separated)")
+
+
+            with gr.Row():
+                auto_file_input = gr.File(label="Upload File (Optional)")
+                auto_query_input = gr.Textbox(label="Search Query")
+
+            with gr.Row():
+                auto_expected_result_input = gr.Textbox(
+                    label="Expected Result (Optional)",
+                    placeholder="Enter expected text if you want to evaluate accuracy"
+                )
+                model_feedback_input = gr.Textbox(
+                    label="Model Feedback (Optional)",
+                    placeholder="Enter any feedback about model performance"
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    # Default model selection
+                    default_models_input = gr.CheckboxGroup(
+                        choices=[f"{type}:{name}"
+                                 for type, names in DEFAULT_MODELS.items()
+                                 for name in names],
+                        label="Default Models",
+                        value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
+                    )
+
+                with gr.Column():
+                    # Custom model input
+                    custom_models_input = gr.TextArea(
+                        label="Custom Models (Optional)",
+                        placeholder="Enter one model per line in format: type:name",
+                        lines=3
+                    )
+
             auto_split_strategies = gr.CheckboxGroup(
                 choices=["token", "recursive"],
                 label="Split Strategies to Test"
@@ -1030,6 +1097,36 @@ def launch_interface(share=True):
             auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
             auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
 
+
+            auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
+            auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
+            recommendations_output = gr.JSON(label="Recommendations")
+
+            auto_submit_button = gr.Button("Run Automated Tests")
+            auto_submit_button.click(
+                fn=lambda *args: run_automated_tests(*args),
+                inputs=[
+                    auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
+                    auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
+                    auto_vector_store_types, auto_search_types, auto_top_k,
+                    auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
+                ],
+                outputs=[auto_results_output, auto_stats_output, recommendations_output]
+            )
+            ###
+
+        with gr.Tab("Results"):
+            with gr.Row():
+                results_output = gr.DataFrame(label="Results")
+                stats_output = gr.DataFrame(label="Statistics")
+
+            with gr.Row():
+                plot_output = gr.Plot(label="Visualizations")
+                model_rankings_output = gr.JSON(label="Model Rankings")
+
+            with gr.Row():
+                recommendations_output = gr.JSON(label="Recommendations")
+
         with gr.Tab("LLM Suggestions"):
             llm_file_input = gr.File(label="Upload File for LLM Suggestions")
             llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
@@ -1072,22 +1169,6 @@ def launch_interface(share=True):
         outputs=[results_output, stats_output, plot_output, best_settings_output]
     )
 
-    auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
-    auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
-    recommendations_output = gr.JSON(label="Recommendations")
-
-    auto_submit_button = gr.Button("Run Automated Tests")
-    auto_submit_button.click(
-        fn=lambda *args: run_automated_tests_and_analyze(*args),
-        inputs=[
-            auto_file_input, auto_query_input, auto_expected_result_input, auto_model_types, auto_model_names,
-            auto_split_strategies, auto_chunk_sizes, auto_overlap_sizes,
-            auto_vector_store_types, auto_search_types, auto_top_k,
-            auto_optimize_vocab, auto_use_query_optimization, auto_use_reranking
-        ],
-        outputs=[auto_results_output, auto_stats_output, recommendations_output]
-    )
-    ###
 
 
     use_case_md = """
@@ -1491,33 +1572,132 @@ if __name__ == "__main__":
 
     iface.launch(share=share)
 
-def run_automated_tests_and_analyze(*args):
-    file, query, auto_expected_result_input, model_types, model_names, split_strategies, chunk_sizes, overlap_sizes, \
-        vector_store_types, search_types, top_k_values, optimize_vocab, use_query_optimization, use_reranking = args
-
-    test_params = {
-        'model_type': model_types,
-        'model_name': [name.strip() for name in model_names.split(',')],
-        'split_strategy': split_strategies,
-        'chunk_size': [int(size.strip()) for size in chunk_sizes.split(',') if size.strip()],
-        'overlap_size': [int(size.strip()) for size in overlap_sizes.split(',') if size.strip()],
-        'vector_store_type': vector_store_types,
-        'search_type': search_types,
-        'top_k': [int(k.strip()) for k in top_k_values.split(',')],
-        'lang': ['german'],  # You can expand this if needed
-        'apply_preprocessing': [True],
-        'optimize_vocab': [optimize_vocab],
-        'apply_phonetic': [True],
-        'phonetic_weight': [0.3],
-        'use_query_optimization': [use_query_optimization],
-        'query_optimization_model': ['google/flan-t5-base'],
-        'use_reranking': [use_reranking]
-    }
-
-    results_df, stats_df = automated_testing(file, query, test_params, auto_expected_result_input)
-    recommendations = analyze_results(stats_df)
+# Enhanced Automated Testing
+def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
+                        test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
+                        model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Enhanced automated testing function with support for custom models and feedback
+    """
+    all_results = []
+    all_stats = []
+    model_manager = ModelManager()
+
+    # Create parameter grid excluding model configurations
+    base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
+    param_grid = ParameterGrid(base_params)
+
+    # Test each model configuration with all parameter combinations
+    for model_config in tqdm(model_configs, desc="Testing models"):
+        model_type = model_config['type']
+        model_name = model_config['name']
+
+        for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
+            try:
+                # Process files and get chunks
+                chunks, embedding_model, num_tokens = process_files(
+                    file_path,
+                    model_type,
+                    model_name,
+                    params['split_strategy'],
+                    params['chunk_size'],
+                    params['overlap_size'],
+                    params.get('custom_separators'),
+                    params['lang'],
+                    params['apply_preprocessing']
+                )
+
+                # Apply vocabulary optimization if specified
+                if params['optimize_vocab']:
+                    tokenizer, chunks = optimize_vocabulary(chunks)
+
+                # Apply query optimization if specified
+                current_query = query
+                if params['use_query_optimization']:
+                    optimized_queries = optimize_query(
+                        query,
+                        params['query_optimization_model'],
+                        chunks,
+                        embedding_model,
+                        params['vector_store_type'],
+                        params['search_type'],
+                        params['top_k']
+                    )
+                    current_query = " ".join(optimized_queries)
+
+                # Perform search
+                results, search_time, vector_store, raw_results = search_embeddings(
+                    chunks,
+                    embedding_model,
+                    params['vector_store_type'],
+                    params['search_type'],
+                    current_query,
+                    params['top_k'],
+                    expected_result,
+                    params['lang'],
+                    params['apply_phonetic'],
+                    params['phonetic_weight']
+                )
+
+                # Apply reranking if specified
+                if params['use_reranking']:
+                    reranker = pipeline("text-classification",
+                                        model="cross-encoder/ms-marco-MiniLM-L-12-v2")
+                    raw_results = rerank_results(raw_results, current_query, reranker)
+
+                # Calculate statistics
+                stats = ResultAnalyzer.calculate_statistics(
+                    raw_results, search_time, vector_store, num_tokens,
+                    embedding_model, current_query, params['top_k'],
+                    expected_result, model_feedback
+                )
+
+                # Update model rankings
+                model_id = f"{model_type}:{model_name}"
+                ranking_score = calculate_model_ranking_score(stats)
+                model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
+
+                # Add model information to stats
+                stats.update({
+                    "model_type": model_type,
+                    "model_name": model_name,
+                    "model": f"{model_type} - {model_name}",
+                    **params
+                })
+
+                # Format and store results
+                all_results.extend(format_results(raw_results, stats))
+                all_stats.append(stats)
+
+            except Exception as e:
+                print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
+                continue
+
+    return pd.DataFrame(all_results), pd.DataFrame(all_stats)
 
-    return results_df, stats_df, recommendations
+# Helper function to calculate model ranking score
+def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
+    """Calculate a composite score for model ranking"""
+    weights = {
+        'search_time': -0.2,  # Negative weight because lower is better
+        'result_diversity': 0.2,
+        'rank_correlation': 0.3,
+        'contains_expected': 0.3,
+        'expected_result_rank': -0.2  # Negative weight because lower rank is better
+    }
+
+    score = 0.0
+    for metric, weight in weights.items():
+        if metric in stats and not isinstance(stats[metric], str):
+            if metric == 'contains_expected':
+                value = float(stats[metric])
+            elif metric == 'expected_result_rank':
+                value = 1.0 / max(stats[metric], 1)  # Convert rank to score (higher is better)
+            else:
+                value = float(stats[metric])
+            score += weight * value
+
+    return score
 
 if __name__ == "__main__":
     launch_interface()
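Next, a small sketch of the diversity metric in ResultAnalyzer._calculate_diversity (one minus the mean pairwise inner product over the upper triangle). The vectors are made up; note that with unnormalized embeddings the inner product is not bounded to [0, 1], so the "diversity" can fall outside that range too:

import numpy as np

embeddings = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # invented vectors
pairwise = np.inner(embeddings, embeddings)
diversity = 1 - np.mean(pairwise[np.triu_indices(len(embeddings), k=1)])
print(diversity)  # pairs give 0.0, 1.0, 1.0 -> 1 - 2/3 ≈ 0.333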
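The new Automation tab collects models as "type:name" strings (default_models_input and custom_models_input), yet the click handler still passes the removed auto_model_types / auto_model_names widgets, so some glue appears to be missing. A hypothetical parser, assuming the one-model-per-line format from the placeholder text; parse_model_specs is an invented name, not part of the commit:

# Hypothetical helper (not in the commit): "type:name" strings -> the
# model_configs list that run_automated_tests expects.
def parse_model_specs(checked, custom_text):
    specs = list(checked) + [ln.strip() for ln in custom_text.splitlines() if ln.strip()]
    return [{"type": t, "name": n} for t, n in (s.split(":", 1) for s in specs)]

print(parse_model_specs(["HuggingFace:e5-base-de"], "OpenAI:text-embedding-ada-002"))
# [{'type': 'HuggingFace', 'name': 'e5-base-de'}, {'type': 'OpenAI', 'name': 'text-embedding-ada-002'}]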
 
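Finally, the composite score from calculate_model_ranking_score, traced by hand on an invented stats dict. One thing worth double-checking: for expected_result_rank the helper both converts the rank to 1/rank ("higher is better") and applies a negative weight, so a better rank subtracts more from the score:

# Invented stats dict, scored by hand with the commit's weights.
stats = {
    "search_time": 0.5,         # weight -0.2
    "result_diversity": 0.6,    # weight  0.2
    "rank_correlation": 0.9,    # weight  0.3
    "contains_expected": True,  # weight  0.3, float(True) == 1.0
    "expected_result_rank": 2,  # becomes 1.0 / 2 = 0.5, weight -0.2
}
score = (-0.2 * 0.5) + (0.2 * 0.6) + (0.3 * 0.9) + (0.3 * 1.0) + (-0.2 * 0.5)
print(round(score, 2))  # -0.10 + 0.12 + 0.27 + 0.30 - 0.10 = 0.49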