Chris4K committed on
Commit
aa72e55
1 Parent(s): 97c3e76

Update app.py

Files changed (1)
  1. app.py +246 -224
app.py CHANGED
@@ -41,8 +41,8 @@ from huggingface_hub import login
41
  from typing import List, Tuple, Optional
42
 
43
 
44
- hf_token = os.getenv("hf_token")
45
- login(token=hf_token)
46
 
47
  # Define the model pipeline with additional generation parameters
48
  #model_pipeline = pipeline(
@@ -154,28 +154,28 @@ class ModelManager:
154
  }
155
  }
156
 
157
-
158
  def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
159
  """Update model ranking based on performance and optional feedback"""
160
  current_score = self.rankings.get(model_id, 0.0)
161
  # Weighted average of current score and new score
162
  self.rankings[model_id] = 0.7 * current_score + 0.3 * score
163
-
164
  if feedback:
165
  if model_id not in self.model_stats:
166
  self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
167
  self.model_stats[model_id]["feedback_count"] += 1
168
  self.model_stats[model_id]["feedback"].append(feedback)
169
-
170
  def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
171
  """Get top n ranked models"""
172
  return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
173
-
174
  def get_model_stats(self, model_id: str) -> Dict[str, Any]:
175
  """Get statistics for a specific model"""
176
  return self.model_stats.get(model_id, {})
177
 
178
-
179
  def add_model(self, provider, name, model_path):
180
  if provider not in self.models:
181
  self.models[provider] = {}
@@ -286,29 +286,29 @@ def simple_tokenize(text):
286
  def preprocess_text(text, lang='german', apply_preprocessing=False):
287
  if not apply_preprocessing:
288
  return text
289
-
290
  text = text.lower()
291
  text = re.sub(r'[^a-zA-Z\s]', '', text)
292
-
293
  try:
294
  tokens = word_tokenize(text, language=lang)
295
  except LookupError:
296
  print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
297
  tokens = simple_tokenize(text)
298
-
299
  try:
300
  stop_words = set(stopwords.words(lang))
301
  except LookupError:
302
  print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
303
  stop_words = set()
304
  tokens = [token for token in tokens if token not in stop_words]
305
-
306
  try:
307
  stemmer = SnowballStemmer(lang)
308
  tokens = [stemmer.stem(token) for token in tokens]
309
  except ValueError:
310
  print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
311
-
312
  return ' '.join(tokens)
313
 
314
  def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
@@ -341,7 +341,7 @@ def optimize_query(
341
  ) -> str:
342
  """
343
  CPU-optimized version of query expansion using a small language model.
344
-
345
  Args:
346
  query: Original search query
347
  query_optimization_model: Name or path of the model to use for optimization
@@ -351,17 +351,17 @@ def optimize_query(
351
  search_type: Type of search being performed
352
  top_k: Number of expansion terms to add
353
  use_gpu: Whether to use GPU if available (defaults to False for CPU)
354
-
355
  Returns:
356
  Expanded query string
357
  """
358
  try:
359
  # Set device
360
  device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
361
-
362
  # 1. Basic text preprocessing (CPU-based)
363
  tokens = word_tokenize(query.lower())
364
-
365
  # 2. WordNet synonyms expansion (CPU-based)
366
  expanded_terms = set()
367
  for token in tokens:
@@ -370,7 +370,7 @@ def optimize_query(
370
  for syn in synsets:
371
  # Limit number of lemmas
372
  expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
373
-
374
  # 3. Use provided model with reduced complexity
375
  try:
376
  # Load model with reduced memory footprint
@@ -384,11 +384,11 @@ def optimize_query(
384
  low_cpu_mem_usage=True,
385
  device_map="cpu"
386
  )
387
-
388
  # Move model to CPU and eval mode
389
  model = model.to(device)
390
  model.eval()
391
-
392
  # Prepare input with reduced length
393
  prompt = f"Enhance this search query with relevant terms: {query}"
394
  inputs = tokenizer(
@@ -398,7 +398,7 @@ def optimize_query(
398
  truncation=True,
399
  padding=True
400
  )
401
-
402
  # Generate with minimal parameters
403
  with torch.no_grad():
404
  outputs = model.generate(
@@ -409,41 +409,41 @@ def optimize_query(
409
  do_sample=False,
410
  early_stopping=True
411
  )
412
-
413
  enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
414
-
415
  # Clear CUDA cache if GPU was used
416
  if device == "cuda":
417
  torch.cuda.empty_cache()
418
-
419
  except Exception as model_error:
420
  print(f"Model-based expansion failed: {str(model_error)}")
421
  enhanced_query = query
422
-
423
  # 4. Combine original and expanded terms
424
  final_terms = set(tokens)
425
  final_terms.update(expanded_terms)
426
  if enhanced_query != query:
427
  final_terms.update(word_tokenize(enhanced_query.lower()))
428
-
429
  # 5. Remove stopwords and select top_k most relevant terms
430
  stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
431
  final_terms = [term for term in final_terms if term not in stopwords]
432
-
433
  # Combine with original query
434
  expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
435
-
436
  # Clean up
437
  del model
438
  del tokenizer
439
  if device == "cuda":
440
  torch.cuda.empty_cache()
441
-
442
- return [Document(page_content=expanded_query.strip())]
443
-
444
  except Exception as e:
445
  print(f"Query optimization failed: {str(e)}")
446
- return [Document(page_content=query)] # Return original query if optimization fails
447
 
448
 
449
 
@@ -458,27 +458,27 @@ optimized_query = optimize_query(
458
  use_gpu=False # Explicitly use CPU
459
  )
460
  """
461
-
462
 
463
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
464
  tokenized_texts = [text.split() for text in texts]
465
-
466
  if model_type == 'word2vec':
467
  model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
468
  elif model_type == 'fasttext':
469
  model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
470
  else:
471
  raise ValueError("Unsupported model type")
472
-
473
  return model
474
 
475
  class CustomEmbeddings(HuggingFaceEmbeddings):
476
  def __init__(self, model_path):
477
  self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
478
-
479
  def embed_documents(self, texts):
480
  return [self.model.wv[text.split()] for text in texts]
481
-
482
  def embed_query(self, text):
483
  return self.model.wv[text.split()]
484
 
@@ -520,7 +520,7 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
520
  chunk_size=chunk_size,
521
  chunk_overlap=overlap_size,
522
  add_start_index=True, # If `True`, includes chunk's start index in metadata
523
- strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
524
  separators=custom_separators or ["\n\n", "\n", " ", ""]
525
  )
526
  else:
@@ -534,7 +534,7 @@ def get_embedding_model(model_type, model_name):
534
  multi_process=True,
535
  # model_kwargs={"device": "cpu"},
536
  #encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
537
- )
538
  elif model_type == 'OpenAI':
539
  return OpenAIEmbeddings(model=model_path)
540
  elif model_type == 'Cohere':
@@ -566,10 +566,10 @@ def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_
566
  phonetic_sim = phonetic_match(doc_text, query)
567
  combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
568
  return combined_sim
569
-
570
  def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
571
  chunks = list(chunks_tuple)
572
-
573
  if vector_store_type == 'FAISS':
574
  return FAISS.from_texts(chunks, embedding_model)
575
  elif vector_store_type == 'Chroma':
@@ -587,7 +587,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
587
  for file in os.listdir(FILES_DIR):
588
  file_path = os.path.join(FILES_DIR, file)
589
  text += FileHandler.extract_text(file_path)
590
-
591
  if custom_tokenizer_file:
592
  tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
593
  text = ' '.join(custom_tokenize(text, tokenizer))
@@ -603,7 +603,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
603
 
604
  def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
605
  preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
606
-
607
  vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
608
  retriever = get_retriever(vector_store, search_type, {"k": top_k})
609
 
@@ -613,10 +613,10 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
613
  #this should be optional
614
  def score_result(doc):
615
  base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
616
-
617
  # Add bonus for containing expected result
618
  expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
619
-
620
  if apply_phonetic:
621
  phonetic_score = phonetic_match(doc.page_content, query)
622
  return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
@@ -645,7 +645,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
645
  # Enhanced Result Analysis
646
  class ResultAnalyzer:
647
  @staticmethod
648
- def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
649
  top_k, expected_result=None, model_feedback=None):
650
  stats = {
651
  "num_results": len(results),
@@ -657,7 +657,7 @@ class ResultAnalyzer:
657
  "embedding_dimension": len(embedding_model.embed_query(query)),
658
  "top_k": top_k,
659
  }
660
-
661
  # Add vector store statistics
662
  try:
663
  if hasattr(vector_store, '_index'):
@@ -666,13 +666,13 @@ class ResultAnalyzer:
666
  stats["vector_store_size"] = len(vector_store._collection.get())
667
  except:
668
  stats["vector_store_size"] = "N/A"
669
-
670
  # Add expected result statistics if provided
671
  if expected_result:
672
  stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
673
- stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
674
  if expected_result in doc.page_content), -1) + 1
675
-
676
  # Calculate diversity metrics for larger result sets
677
  if len(results) > 3: # Changed from 1000 to make it more practical
678
  embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
@@ -681,7 +681,7 @@ class ResultAnalyzer:
681
  else:
682
  stats["result_diversity"] = "N/A"
683
  stats["silhouette_score"] = "N/A"
684
-
685
  # Add ranking correlation
686
  query_embedding = embedding_model.embed_query(query)
687
  result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
@@ -691,20 +691,20 @@ class ResultAnalyzer:
691
  stats["rank_correlation"] = rank_correlation
692
  else:
693
  stats["rank_correlation"] = "N/A"
694
-
695
  # Add model feedback if provided
696
  if model_feedback:
697
  stats["model_feedback"] = model_feedback
698
-
699
  return stats
700
-
701
  @staticmethod
702
  def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
703
  """Calculate diversity score for embeddings"""
704
  embeddings_array = np.array(embeddings)
705
  pairwise_similarities = np.inner(embeddings_array, embeddings_array)
706
  return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
707
-
708
  @staticmethod
709
  def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
710
  """Calculate silhouette score for embeddings"""
@@ -724,13 +724,13 @@ def visualize_results(results_df, stats_df):
724
  # Add model column if not present
725
  if 'model' not in stats_df.columns:
726
  stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
727
-
728
  fig, axs = plt.subplots(2, 2, figsize=(20, 20))
729
-
730
  # Handle empty dataframe case
731
  if len(stats_df) == 0:
732
  return fig
733
-
734
  # Create plots with error handling
735
  try:
736
  sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
@@ -738,36 +738,36 @@ def visualize_results(results_df, stats_df):
738
  axs[0, 0].tick_params(axis='x', rotation=45)
739
  except Exception as e:
740
  print(f"Error in search time plot: {e}")
741
-
742
  try:
743
- sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
744
  hue='model', ax=axs[0, 1])
745
  axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
746
  except Exception as e:
747
  print(f"Error in diversity plot: {e}")
748
-
749
  try:
750
  sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
751
  axs[1, 0].set_title('Distribution of Result Content Lengths')
752
  axs[1, 0].tick_params(axis='x', rotation=45)
753
  except Exception as e:
754
  print(f"Error in content length plot: {e}")
755
-
756
  try:
757
  valid_embeddings = results_df['embedding'].dropna().values
758
  if len(valid_embeddings) > 1:
759
  tsne = TSNE(n_components=2, random_state=42)
760
  embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
761
- sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
762
- hue=results_df['Model'][:len(valid_embeddings)],
763
  ax=axs[1, 1])
764
  axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
765
  else:
766
- axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
767
  ha='center', va='center')
768
  except Exception as e:
769
  print(f"Error in embedding visualization: {e}")
770
-
771
  plt.tight_layout()
772
  return fig
773
 
@@ -778,56 +778,56 @@ def visualize_results(results_df, stats_df):
778
  #plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
779
  #plt.show()
780
 
781
-
782
  def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
783
  tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
784
 
785
  word_freq = Counter(word for text in texts for word in text.split())
786
-
787
  optimized_texts = [
788
  ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
789
  for text in texts
790
  ]
791
-
792
  trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
793
  tokenizer.train_from_iterator(optimized_texts, trainer)
794
-
795
  return tokenizer, optimized_texts
796
-
797
  import numpy as np
798
  from transformers import TextClassificationPipeline
799
  from typing import List, Union, Any
800
 
801
-
802
 
803
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
804
 
805
 
806
  def rerank_results(
807
- results: List[Any],
808
- query: str,
809
  reranker: Union[TextClassificationPipeline, Any]
810
  ) -> List[Any]:
811
  """
812
-
813
  """
814
  if not results:
815
  return results
816
-
817
  # Step 1: Encode the query and documents using SentenceTransformer
818
  query_embedding = model.encode(query, convert_to_tensor=True)
819
  doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
820
  doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
821
-
822
  # Step 2: Compute cosine similarities between query and document embeddings
823
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
824
-
825
  # Step 3: Sort documents by similarity score in descending order
826
- reranked_idx = np.argsort(cosine_scores.numpy())[::-1]
827
-
828
  # Step 4: Return the reranked documents
829
  reranked_results = [results[i] for i in reranked_idx]
830
-
831
  return reranked_results
832
 
833
 
@@ -878,13 +878,13 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
878
  if optimize_vocab:
879
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
880
  chunks = optimized_chunks
881
-
882
  search_query = query
883
-
884
  if use_query_optimization:
885
  optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
886
  #query = " ".join(optimized_queries)
887
- search_query = " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
888
 
889
  results, search_time, vector_store, results_raw = search_embeddings(
890
  chunks,
@@ -897,8 +897,8 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
897
  lang,
898
  apply_phonetic,
899
  phonetic_weight
900
- )
901
-
902
  if use_reranking:
903
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
904
  results_raw = rerank_results(results_raw, query, reranker)
@@ -953,7 +953,7 @@ from tqdm import tqdm
953
  def automated_testing(file, query, test_params, expected_result=None):
954
  all_results = []
955
  all_stats = []
956
-
957
  param_grid = ParameterGrid(test_params)
958
  print(param_grid)
959
  for params in tqdm(param_grid, desc="Running tests"):
@@ -995,7 +995,7 @@ def automated_testing(file, query, test_params, expected_result=None):
995
  params['apply_phonetic'],
996
  params['phonetic_weight']
997
  )
998
-
999
  if params['use_reranking']:
1000
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1001
  results_raw = rerank_results(results_raw, query, reranker)
@@ -1022,17 +1022,27 @@ def analyze_results(stats_df):
1022
  'contains_expected': 0.5, # High weight for containing the expected result
1023
  'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
1024
  }
1025
-
 
 
 
1026
  for metric in metric_weights.keys():
1027
- stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
1028
-
 
 
 
 
 
 
 
1029
  stats_df['weighted_score'] = sum(
1030
- stats_df[metric].fillna(0) * weight
1031
  for metric, weight in metric_weights.items()
1032
  )
1033
-
1034
  best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
1035
-
1036
  recommendations = {
1037
  'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
1038
  'best_settings': {
@@ -1059,7 +1069,7 @@ def analyze_results(stats_df):
1059
  'expected_result_rank': int(best_config['expected_result_rank'])
1060
  }
1061
  }
1062
-
1063
  return recommendations
1064
 
1065
  ####
@@ -1069,72 +1079,85 @@ def get_llm_suggested_settings(file, num_chunks=1):
1069
  return {"error": "No file uploaded"}
1070
 
1071
  chunks, _, _ = process_files(
1072
- file.name,
1073
- 'HuggingFace',
1074
- 'paraphrase-miniLM',
1075
- 'recursive',
1076
- 250,
1077
  50,
1078
  custom_separators=None
1079
  )
1080
-
1081
  # Select a few random chunks
1082
  sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
1083
-
1084
- # Prepare the prompt
1085
- prompt = f"""Given the following text chunks from a document, suggest optimal settings for an embedding-based search system. The settings should include:
1086
-
1087
- 1. Embedding model type and name
1088
- 2. Split strategy (token or recursive)
1089
- 3. Chunk size
1090
- 4. Overlap size
1091
- 5. Vector store type (FAISS or Chroma)
1092
- 6. Search type (similarity, mmr, or custom)
1093
- 7. Top K results to retrieve
1094
- 8. Whether to apply preprocessing
1095
- 9. Whether to optimize vocabulary
1096
- 10. Whether to apply phonetic matching
1097
-
1098
- Expected output format:
1099
- {{
1100
- "embedding_models": "embedding_model_type:embedding_model_name",
1101
- "split_strategy": "token or recursive",
1102
- "chunk_size": 250,
1103
- "overlap_size": 50,
1104
- "vector_store_type": "FAISS or Chroma",
1105
- "search_type": "similarity, mmr, or custom",
1106
- "top_k": 5,
1107
- "apply_preprocessing": True,
1108
- "optimize_vocab": True,
1109
- "apply_phonetic": False,
1110
- "phonetic_weight": 0.3 # Default value, as it's not in the LLM suggestions
1111
- }}
1112
-
1113
- Text chunks:
1114
- {' '.join(sample_chunks)}
1115
-
1116
- Provide your suggestions in a Python dictionary format."""
1117
-
1118
- # Use a HuggingFace model for text generation
1119
- #model_id = "google/flan-t5-large"
1120
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
1121
- #model = AutoModelForCausalLM.from_pretrained(model_id)
1122
- #pipe = pipeline(
1123
- # "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512
1124
- #)
1125
- #llm = HuggingFacePipeline(pipeline=pipe)
1126
-
1127
- #llm = HuggingFacePipeline(pipeline(model="HuggingFaceH4/zephyr-7b-beta"))
1128
-
1129
-
1130
- #llm = HuggingFacePipeline.from_model_id(
1131
- # model_id="google/flan-t5-large",
1132
- # task="text2text-generation",
1133
- # model_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 512},
1134
- #)
1135
-
1136
- # Generate suggestions
1137
- suggested_settings = llm.invoke(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
  print("setting suggested")
1139
  print(suggested_settings)
1140
  # Parse the generated text to extract the dictionary
@@ -1160,7 +1183,7 @@ Provide your suggestions in a Python dictionary format."""
1160
  def update_inputs_with_llm_suggestions(suggestions):
1161
  if suggestions is None or "error" in suggestions:
1162
  return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
1163
-
1164
  return [
1165
  gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
1166
  gr.update(value=suggestions["split_strategy"]), # split_strategy_input
@@ -1178,16 +1201,16 @@ def update_inputs_with_llm_suggestions(suggestions):
1178
  def parse_model_selections(default_models, custom_models):
1179
  """
1180
  Parse selected default models and custom models into model configurations
1181
-
1182
  Args:
1183
  default_models (List[str]): Selected default models in format "type:name"
1184
  custom_models (str): Custom models string with one model per line in format "type:name"
1185
-
1186
  Returns:
1187
  List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
1188
  """
1189
  model_configs = []
1190
-
1191
  # Process default models
1192
  if default_models:
1193
  for model in default_models:
@@ -1196,7 +1219,7 @@ def parse_model_selections(default_models, custom_models):
1196
  'type': model_type,
1197
  'name': model_name
1198
  })
1199
-
1200
  # Process custom models
1201
  if custom_models:
1202
  custom_model_lines = custom_models.strip().split('\n')
@@ -1207,7 +1230,7 @@ def parse_model_selections(default_models, custom_models):
1207
  'type': model_type.strip(),
1208
  'name': model_name.strip()
1209
  })
1210
-
1211
  return model_configs
1212
 
1213
  def parse_comma_separated(text):
@@ -1217,12 +1240,12 @@ def parse_comma_separated(text):
1217
  return [x.strip() for x in text.split(',') if x.strip()]
1218
 
1219
 
1220
-
1221
  # Gradio Interface
1222
  def launch_interface(debug=True):
1223
  with gr.Blocks() as iface:
1224
  gr.Markdown("# Advanced Embedding Comparison Tool")
1225
-
1226
  with gr.Tab("Simple"):
1227
  file_input = gr.File(label="Upload File (Optional)")
1228
  query_input = gr.Textbox(label="Search Query")
@@ -1237,7 +1260,7 @@ def launch_interface(debug=True):
1237
  label="Embedding Models"
1238
  )
1239
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
1240
-
1241
  with gr.Tab("Advanced"):
1242
  custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
1243
  split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -1247,7 +1270,7 @@ def launch_interface(debug=True):
1247
  vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
1248
  search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
1249
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
1250
-
1251
  with gr.Tab("Expert"):
1252
  apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
1253
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
@@ -1265,7 +1288,7 @@ def launch_interface(debug=True):
1265
  with gr.Row():
1266
  auto_file_input = gr.File(label="Upload File (Optional)")
1267
  auto_query_input = gr.Textbox(label="Search Query")
1268
-
1269
  with gr.Row():
1270
  auto_expected_result_input = gr.Textbox(
1271
  label="Expected Result (Optional)",
@@ -1275,18 +1298,18 @@ def launch_interface(debug=True):
1275
  label="Model Feedback (Optional)",
1276
  placeholder="Enter any feedback about model performance"
1277
  )
1278
-
1279
  with gr.Row():
1280
  with gr.Column():
1281
  # Default model selection
1282
  default_models_input = gr.CheckboxGroup(
1283
- choices=[f"{type}:{name}"
1284
- for type, names in DEFAULT_MODELS.items()
1285
  for name in names],
1286
  label="Default Models",
1287
  value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
1288
  )
1289
-
1290
  with gr.Column():
1291
  # Custom model input
1292
  custom_models_input = gr.TextArea(
@@ -1294,7 +1317,7 @@ def launch_interface(debug=True):
1294
  placeholder="Enter one model per line in format: type:name",
1295
  lines=3
1296
  )
1297
-
1298
  auto_split_strategies = gr.CheckboxGroup(
1299
  choices=["token", "recursive"],
1300
  label="Split Strategies to Test"
@@ -1313,21 +1336,21 @@ def launch_interface(debug=True):
1313
  auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
1314
  auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
1315
  auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
1316
-
1317
  auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
1318
  auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
1319
  recommendations_output = gr.JSON(label="Recommendations")
1320
-
1321
  def run_automation(file_input, query_input, expected_result, default_models, custom_models,
1322
  split_strategies, chunk_sizes, overlap_sizes,
1323
  vector_store_types, search_types, top_k_values,
1324
  optimize_vocab, use_query_optimization, use_reranking,
1325
  model_feedback):
1326
  """Wrapper function to handle Gradio inputs and run automated tests"""
1327
-
1328
  # Parse model configurations
1329
  model_configs = parse_model_selections(default_models, custom_models)
1330
-
1331
  # Parse test parameters
1332
  test_params = {
1333
  'split_strategy': split_strategies,
@@ -1346,7 +1369,7 @@ def launch_interface(debug=True):
1346
  'custom_separators': [None],
1347
  'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
1348
  }
1349
-
1350
  # Run automated tests
1351
  results_df, stats_df = run_automated_tests(
1352
  file_input.name if file_input else None,
@@ -1356,12 +1379,12 @@ def launch_interface(debug=True):
1356
  expected_result if expected_result else None,
1357
  model_feedback if model_feedback else None
1358
  )
1359
-
1360
  # Generate recommendations based on results
1361
  recommendations = analyze_results(stats_df)
1362
-
1363
  return results_df, stats_df, recommendations
1364
-
1365
  auto_submit_button = gr.Button("Run Automated Tests")
1366
  auto_submit_button.click(
1367
  fn=run_automation,
@@ -1376,25 +1399,25 @@ def launch_interface(debug=True):
1376
  outputs=[auto_results_output, auto_stats_output, recommendations_output]
1377
  )
1378
  ###
1379
-
1380
  with gr.Tab("Results"):
1381
  with gr.Row():
1382
  results_output = gr.DataFrame(label="Results")
1383
  stats_output = gr.DataFrame(label="Statistics")
1384
-
1385
  with gr.Row():
1386
  plot_output = gr.Plot(label="Visualizations")
1387
  model_rankings_output = gr.JSON(label="Model Rankings")
1388
-
1389
  with gr.Row():
1390
  recommendations_output = gr.JSON(label="Recommendations")
1391
-
1392
  with gr.Tab("LLM Suggestions"):
1393
  llm_file_input = gr.File(label="Upload File for LLM Suggestions")
1394
  llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
1395
  llm_suggest_button = gr.Button("Get LLM Suggestions")
1396
  llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
1397
-
1398
  llm_suggest_button.click(
1399
  fn=get_llm_suggested_settings,
1400
  inputs=[llm_file_input, llm_num_chunks],
@@ -1403,9 +1426,9 @@ def launch_interface(debug=True):
1403
  fn=update_inputs_with_llm_suggestions,
1404
  inputs=[llm_suggestions_output],
1405
  outputs=[
1406
- embedding_models_input, split_strategy_input, chunk_size_input,
1407
- overlap_size_input, vector_store_type_input, search_type_input,
1408
- top_k_input, apply_preprocessing_input, optimize_vocab_input,
1409
  apply_phonetic_input, phonetic_weight_input
1410
  ]
1411
  )
@@ -1526,7 +1549,7 @@ Create a simple chat interface and test with various queries about the AI Act. F
1526
  User: "Was sind die Hauptziele des KI-Gesetzes?"
1527
  """
1528
 
1529
-
1530
  tutorial_md = """
1531
  # Advanced Embedding Comparison Tool Tutorial
1532
 
@@ -1675,13 +1698,13 @@ Measures how well an object fits within its own cluster compared to others. Scor
1675
  def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
1676
  with open(file_path, 'r', encoding='utf-8') as f:
1677
  text = f.read()
1678
-
1679
  tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
1680
  tokenizer.pre_tokenizer = Whitespace()
1681
-
1682
  trainer = trainers.WordLevelTrainer(special_tokens=special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
1683
  tokenizer.train_from_iterator([text], trainer)
1684
-
1685
  return tokenizer
1686
  ````
1687
 
@@ -1713,39 +1736,39 @@ def rerank_results(results, query, reranker):
1713
 
1714
 
1715
  ## Useful Resources and Links
1716
-
1717
  Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
1718
-
1719
  ### Embeddings and Vector Databases
1720
  - [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
1721
  - [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
1722
  - [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
1723
-
1724
  ### Natural Language Processing
1725
  - [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
1726
  - [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
1727
  - [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
1728
-
1729
  ### Retrieval-Augmented Generation (RAG)
1730
  - [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
1731
  - [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
1732
-
1733
  ### German Language Processing
1734
  - [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
1735
  - [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
1736
-
1737
  ### Benchmarks and Evaluation
1738
  - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
1739
  - [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
1740
-
1741
  ### Tools and Libraries
1742
  - [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
1743
  - [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
1744
-
1745
  ### Support me
1746
  - [Visual Crew Builder](https://visual-crew.builder.ai/): Tool for create AI systems, workflows and api. Or just a notebook.
1747
-
1748
-
1749
 
1750
  This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
1751
 
@@ -1768,7 +1791,7 @@ def create_chat_app(settings):
1768
  settings['lang'],
1769
  settings['apply_preprocessing']
1770
  )
1771
-
1772
  results, _, _, _ = search_embeddings(
1773
  chunks,
1774
  embedding_model,
@@ -1780,12 +1803,12 @@ def create_chat_app(settings):
1780
  apply_phonetic=settings['apply_phonetic'],
1781
  phonetic_weight=settings['phonetic_weight']
1782
  )
1783
-
1784
  # Generate a response based on the retrieved results
1785
  response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
1786
  for i, result in enumerate(results[:settings['top_k']]):
1787
  response += f"{i+1}. {result['content'][:100]}...\n\n"
1788
-
1789
  return response
1790
 
1791
  with gr.Blocks() as chat_interface:
@@ -1823,7 +1846,7 @@ if __name__ == "__main__":
1823
  launch_interface()
1824
  # Uncomment the following line to launch the sample chat app
1825
  ´´´
1826
-
1827
  """
1828
 
1829
 
@@ -1832,10 +1855,10 @@ if __name__ == "__main__":
1832
  ["Embedding Comparison", "Tutorial", "Use Case"]
1833
  )
1834
 
1835
- iface.launch(debug=dubug)
1836
 
1837
  # Enhanced Automated Testing
1838
- def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
1839
  test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
1840
  model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
1841
  """
@@ -1844,16 +1867,16 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1844
  all_results = []
1845
  all_stats = []
1846
  model_manager = ModelManager()
1847
-
1848
  # Create parameter grid excluding model configurations
1849
  base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
1850
  param_grid = ParameterGrid(base_params)
1851
-
1852
  # Test each model configuration with all parameter combinations
1853
  for model_config in tqdm(model_configs, desc="Testing models"):
1854
  model_type = model_config['type']
1855
  model_name = model_config['name']
1856
-
1857
  for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
1858
  try:
1859
  # Process files and get chunks
@@ -1868,11 +1891,11 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1868
  params['lang'],
1869
  params['apply_preprocessing']
1870
  )
1871
-
1872
  # Apply vocabulary optimization if specified
1873
  if params['optimize_vocab']:
1874
  tokenizer, chunks = optimize_vocabulary(chunks)
1875
-
1876
  # Apply query optimization if specified
1877
  current_query = query
1878
  if params['use_query_optimization']:
@@ -1886,7 +1909,7 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1886
  params['top_k']
1887
  )
1888
  current_query = " ".join(optimized_queries)
1889
-
1890
  # Perform search
1891
  results, search_time, vector_store, raw_results = search_embeddings(
1892
  chunks,
@@ -1900,25 +1923,25 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1900
  params['apply_phonetic'],
1901
  params['phonetic_weight']
1902
  )
1903
-
1904
  # Apply reranking if specified
1905
  if params['use_reranking']:
1906
- reranker = pipeline("text-classification",
1907
  model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1908
  raw_results = rerank_results(raw_results, current_query, reranker)
1909
-
1910
  # Calculate statistics
1911
  stats = ResultAnalyzer.calculate_statistics(
1912
  raw_results, search_time, vector_store, num_tokens,
1913
  embedding_model, current_query, params['top_k'],
1914
  expected_result, model_feedback
1915
  )
1916
-
1917
  # Update model rankings
1918
  model_id = f"{model_type}:{model_name}"
1919
  ranking_score = calculate_model_ranking_score(stats)
1920
  model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
1921
-
1922
  # Add model information to stats
1923
  stats.update({
1924
  "model_type": model_type,
@@ -1926,15 +1949,15 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1926
  "model": f"{model_type} - {model_name}",
1927
  **params
1928
  })
1929
-
1930
  # Format and store results
1931
  all_results.extend(format_results(raw_results, stats))
1932
  all_stats.append(stats)
1933
-
1934
  except Exception as e:
1935
  print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
1936
  continue
1937
-
1938
  return pd.DataFrame(all_results), pd.DataFrame(all_stats)
1939
 
1940
  # Helper function to calculate model ranking score
@@ -1947,7 +1970,7 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
1947
  'contains_expected': 0.3,
1948
  'expected_result_rank': -0.2 # Negative weight because lower rank is better
1949
  }
1950
-
1951
  score = 0.0
1952
  for metric, weight in weights.items():
1953
  if metric in stats and not isinstance(stats[metric], str):
@@ -1958,9 +1981,8 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
1958
  else:
1959
  value = float(stats[metric])
1960
  score += weight * value
1961
-
1962
  return score
1963
 
1964
  if __name__ == "__main__":
1965
  launch_interface()
1966
-
 
41
  from typing import List, Tuple, Optional
42
 
43
 
44
+ #hf_token = os.getenv("hf_token")
45
+ #login(token=hf_token)
46
 
47
  # Define the model pipeline with additional generation parameters
48
  #model_pipeline = pipeline(
 
154
  }
155
  }
156
 
157
+
158
  def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
159
  """Update model ranking based on performance and optional feedback"""
160
  current_score = self.rankings.get(model_id, 0.0)
161
  # Weighted average of current score and new score
162
  self.rankings[model_id] = 0.7 * current_score + 0.3 * score
163
+
164
  if feedback:
165
  if model_id not in self.model_stats:
166
  self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
167
  self.model_stats[model_id]["feedback_count"] += 1
168
  self.model_stats[model_id]["feedback"].append(feedback)
169
+
170
  def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
171
  """Get top n ranked models"""
172
  return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
173
+
174
  def get_model_stats(self, model_id: str) -> Dict[str, Any]:
175
  """Get statistics for a specific model"""
176
  return self.model_stats.get(model_id, {})
177
 
178
+
179
  def add_model(self, provider, name, model_path):
180
  if provider not in self.models:
181
  self.models[provider] = {}
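
For context on the ranking logic in `update_model_ranking` above: the 0.7/0.3 blend is an exponential moving average, so recent test runs gradually dominate the stored ranking. A minimal standalone sketch (hypothetical scores, not part of app.py):

```python
# Hypothetical illustration of the 0.7 * current + 0.3 * new update rule
def ema_update(current: float, new_score: float, alpha: float = 0.3) -> float:
    return (1 - alpha) * current + alpha * new_score

score = 0.0
for observed in (0.8, 0.9, 0.85):   # scores from three hypothetical test runs
    score = ema_update(score, observed)
    print(round(score, 3))           # 0.24, 0.438, 0.562 -> drifts toward recent scores
```
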
 
286
  def preprocess_text(text, lang='german', apply_preprocessing=False):
287
  if not apply_preprocessing:
288
  return text
289
+
290
  text = text.lower()
291
  text = re.sub(r'[^a-zA-Z\s]', '', text)
292
+
293
  try:
294
  tokens = word_tokenize(text, language=lang)
295
  except LookupError:
296
  print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
297
  tokens = simple_tokenize(text)
298
+
299
  try:
300
  stop_words = set(stopwords.words(lang))
301
  except LookupError:
302
  print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
303
  stop_words = set()
304
  tokens = [token for token in tokens if token not in stop_words]
305
+
306
  try:
307
  stemmer = SnowballStemmer(lang)
308
  tokens = [stemmer.stem(token) for token in tokens]
309
  except ValueError:
310
  print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
311
+
312
  return ' '.join(tokens)
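
A quick way to exercise the preprocessing path above (a sketch, assuming the NLTK punkt and stopwords data are available for the chosen language; note that the ASCII-only regex also strips German umlauts before tokenization):

```python
import nltk

nltk.download("punkt", quiet=True)       # tokenizer data used by word_tokenize
nltk.download("stopwords", quiet=True)   # stopword lists used above

sample = "Die Verordnung regelt die Anforderungen an KI-Systeme."
print(preprocess_text(sample, lang="german", apply_preprocessing=True))
# -> lowercased, punctuation/umlauts stripped, stopwords removed, stems joined by spaces
```
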
313
 
314
  def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
 
341
  ) -> str:
342
  """
343
  CPU-optimized version of query expansion using a small language model.
344
+
345
  Args:
346
  query: Original search query
347
  query_optimization_model: Name or path of the model to use for optimization
 
351
  search_type: Type of search being performed
352
  top_k: Number of expansion terms to add
353
  use_gpu: Whether to use GPU if available (defaults to False for CPU)
354
+
355
  Returns:
356
  Expanded query string
357
  """
358
  try:
359
  # Set device
360
  device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
361
+
362
  # 1. Basic text preprocessing (CPU-based)
363
  tokens = word_tokenize(query.lower())
364
+
365
  # 2. WordNet synonyms expansion (CPU-based)
366
  expanded_terms = set()
367
  for token in tokens:
 
370
  for syn in synsets:
371
  # Limit number of lemmas
372
  expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
373
+
374
  # 3. Use provided model with reduced complexity
375
  try:
376
  # Load model with reduced memory footprint
 
384
  low_cpu_mem_usage=True,
385
  device_map="cpu"
386
  )
387
+
388
  # Move model to CPU and eval mode
389
  model = model.to(device)
390
  model.eval()
391
+
392
  # Prepare input with reduced length
393
  prompt = f"Enhance this search query with relevant terms: {query}"
394
  inputs = tokenizer(
 
398
  truncation=True,
399
  padding=True
400
  )
401
+
402
  # Generate with minimal parameters
403
  with torch.no_grad():
404
  outputs = model.generate(
 
409
  do_sample=False,
410
  early_stopping=True
411
  )
412
+
413
  enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
414
+
415
  # Clear CUDA cache if GPU was used
416
  if device == "cuda":
417
  torch.cuda.empty_cache()
418
+
419
  except Exception as model_error:
420
  print(f"Model-based expansion failed: {str(model_error)}")
421
  enhanced_query = query
422
+
423
  # 4. Combine original and expanded terms
424
  final_terms = set(tokens)
425
  final_terms.update(expanded_terms)
426
  if enhanced_query != query:
427
  final_terms.update(word_tokenize(enhanced_query.lower()))
428
+
429
  # 5. Remove stopwords and select top_k most relevant terms
430
  stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
431
  final_terms = [term for term in final_terms if term not in stopwords]
432
+
433
  # Combine with original query
434
  expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
435
+
436
  # Clean up
437
  del model
438
  del tokenizer
439
  if device == "cuda":
440
  torch.cuda.empty_cache()
441
+
442
+ return expanded_query.strip() #[Document(page_content=expanded_query.strip())]
443
+
444
  except Exception as e:
445
  print(f"Query optimization failed: {str(e)}")
446
+ return query #[Document(page_content=query)] # Return original query if optimization fails
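
With this change `optimize_query` returns a plain string instead of a one-element `Document` list, so callers can use the value directly. A hedged usage sketch (argument values are illustrative; the positional order follows the call made later in `compare_embeddings`):

```python
# Illustrative call; 'google/flan-t5-base' is the default from the test parameter grid
expanded = optimize_query(
    "Hauptziele des KI-Gesetzes",   # query
    "google/flan-t5-base",          # query_optimization_model
    chunks, embedding_model,         # passed through by the caller
    "FAISS", "similarity", 3,        # vector_store_type, search_type, top_k
)
search_query = expanded              # plain string; no Document unwrapping needed
```
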
447
 
448
 
449
 
 
458
  use_gpu=False # Explicitly use CPU
459
  )
460
  """
461
+
462
 
463
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
464
  tokenized_texts = [text.split() for text in texts]
465
+
466
  if model_type == 'word2vec':
467
  model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
468
  elif model_type == 'fasttext':
469
  model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
470
  else:
471
  raise ValueError("Unsupported model type")
472
+
473
  return model
474
 
475
  class CustomEmbeddings(HuggingFaceEmbeddings):
476
  def __init__(self, model_path):
477
  self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
478
+
479
  def embed_documents(self, texts):
480
  return [self.model.wv[text.split()] for text in texts]
481
+
482
  def embed_query(self, text):
483
  return self.model.wv[text.split()]
484
 
 
520
  chunk_size=chunk_size,
521
  chunk_overlap=overlap_size,
522
  add_start_index=True, # If `True`, includes chunk's start index in metadata
523
+ strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
524
  separators=custom_separators or ["\n\n", "\n", " ", ""]
525
  )
526
  else:
 
534
  multi_process=True,
535
  # model_kwargs={"device": "cpu"},
536
  #encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
537
+ )
538
  elif model_type == 'OpenAI':
539
  return OpenAIEmbeddings(model=model_path)
540
  elif model_type == 'Cohere':
 
566
  phonetic_sim = phonetic_match(doc_text, query)
567
  combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
568
  return combined_sim
569
+
570
  def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
571
  chunks = list(chunks_tuple)
572
+
573
  if vector_store_type == 'FAISS':
574
  return FAISS.from_texts(chunks, embedding_model)
575
  elif vector_store_type == 'Chroma':
 
587
  for file in os.listdir(FILES_DIR):
588
  file_path = os.path.join(FILES_DIR, file)
589
  text += FileHandler.extract_text(file_path)
590
+
591
  if custom_tokenizer_file:
592
  tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
593
  text = ' '.join(custom_tokenize(text, tokenizer))
 
603
 
604
  def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
605
  preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
606
+
607
  vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
608
  retriever = get_retriever(vector_store, search_type, {"k": top_k})
609
 
 
613
  #this should be optional
614
  def score_result(doc):
615
  base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
616
+
617
  # Add bonus for containing expected result
618
  expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
619
+
620
  if apply_phonetic:
621
  phonetic_score = phonetic_match(doc.page_content, query)
622
  return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
 
645
  # Enhanced Result Analysis
646
  class ResultAnalyzer:
647
  @staticmethod
648
+ def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
649
  top_k, expected_result=None, model_feedback=None):
650
  stats = {
651
  "num_results": len(results),
 
657
  "embedding_dimension": len(embedding_model.embed_query(query)),
658
  "top_k": top_k,
659
  }
660
+
661
  # Add vector store statistics
662
  try:
663
  if hasattr(vector_store, '_index'):
 
666
  stats["vector_store_size"] = len(vector_store._collection.get())
667
  except:
668
  stats["vector_store_size"] = "N/A"
669
+
670
  # Add expected result statistics if provided
671
  if expected_result:
672
  stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
673
+ stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
674
  if expected_result in doc.page_content), -1) + 1
675
+
676
  # Calculate diversity metrics for larger result sets
677
  if len(results) > 3: # Changed from 1000 to make it more practical
678
  embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
 
681
  else:
682
  stats["result_diversity"] = "N/A"
683
  stats["silhouette_score"] = "N/A"
684
+
685
  # Add ranking correlation
686
  query_embedding = embedding_model.embed_query(query)
687
  result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
 
691
  stats["rank_correlation"] = rank_correlation
692
  else:
693
  stats["rank_correlation"] = "N/A"
694
+
695
  # Add model feedback if provided
696
  if model_feedback:
697
  stats["model_feedback"] = model_feedback
698
+
699
  return stats
700
+
701
  @staticmethod
702
  def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
703
  """Calculate diversity score for embeddings"""
704
  embeddings_array = np.array(embeddings)
705
  pairwise_similarities = np.inner(embeddings_array, embeddings_array)
706
  return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
707
+
708
  @staticmethod
709
  def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
710
  """Calculate silhouette score for embeddings"""
 
724
  # Add model column if not present
725
  if 'model' not in stats_df.columns:
726
  stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
727
+
728
  fig, axs = plt.subplots(2, 2, figsize=(20, 20))
729
+
730
  # Handle empty dataframe case
731
  if len(stats_df) == 0:
732
  return fig
733
+
734
  # Create plots with error handling
735
  try:
736
  sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
 
738
  axs[0, 0].tick_params(axis='x', rotation=45)
739
  except Exception as e:
740
  print(f"Error in search time plot: {e}")
741
+
742
  try:
743
+ sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
744
  hue='model', ax=axs[0, 1])
745
  axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
746
  except Exception as e:
747
  print(f"Error in diversity plot: {e}")
748
+
749
  try:
750
  sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
751
  axs[1, 0].set_title('Distribution of Result Content Lengths')
752
  axs[1, 0].tick_params(axis='x', rotation=45)
753
  except Exception as e:
754
  print(f"Error in content length plot: {e}")
755
+
756
  try:
757
  valid_embeddings = results_df['embedding'].dropna().values
758
  if len(valid_embeddings) > 1:
759
  tsne = TSNE(n_components=2, random_state=42)
760
  embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
761
+ sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
762
+ hue=results_df['Model'][:len(valid_embeddings)],
763
  ax=axs[1, 1])
764
  axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
765
  else:
766
+ axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
767
  ha='center', va='center')
768
  except Exception as e:
769
  print(f"Error in embedding visualization: {e}")
770
+
771
  plt.tight_layout()
772
  return fig
773
 
 
778
  #plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
779
  #plt.show()
780
 
781
+
782
  def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
783
  tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
784
 
785
  word_freq = Counter(word for text in texts for word in text.split())
786
+
787
  optimized_texts = [
788
  ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
789
  for text in texts
790
  ]
791
+
792
  trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
793
  tokenizer.train_from_iterator(optimized_texts, trainer)
794
+
795
  return tokenizer, optimized_texts
796
+
797
  import numpy as np
798
  from transformers import TextClassificationPipeline
799
  from typing import List, Union, Any
800
 
801
+
802
 
803
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
804
 
805
 
806
  def rerank_results(
807
+ results: List[Any],
808
+ query: str,
809
  reranker: Union[TextClassificationPipeline, Any]
810
  ) -> List[Any]:
811
  """
812
+
813
  """
814
  if not results:
815
  return results
816
+
817
  # Step 1: Encode the query and documents using SentenceTransformer
818
  query_embedding = model.encode(query, convert_to_tensor=True)
819
  doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
820
  doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
821
+
822
  # Step 2: Compute cosine similarities between query and document embeddings
823
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
824
+
825
  # Step 3: Sort documents by similarity score in descending order
826
+ reranked_idx = np.argsort(cosine_scores.cpu().numpy())[::-1]
827
+
828
  # Step 4: Return the reranked documents
829
  reranked_results = [results[i] for i in reranked_idx]
830
+
831
  return reranked_results
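
The added `.cpu()` call matters when the SentenceTransformer runs on a GPU, since NumPy cannot read CUDA tensors directly. A self-contained sketch of the same cosine-similarity reranking idea (example documents are hypothetical):

```python
from sentence_transformers import SentenceTransformer, util
import numpy as np

st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
docs = ["Das KI-Gesetz regelt Hochrisiko-Systeme.", "Unrelated text about cooking."]

q_emb = st.encode("Ziele des KI-Gesetzes", convert_to_tensor=True)
d_emb = st.encode(docs, convert_to_tensor=True)

scores = util.cos_sim(q_emb, d_emb)[0]               # one similarity per document
order = np.argsort(scores.cpu().numpy())[::-1]        # .cpu() keeps this safe on GPU
print([docs[i] for i in order])                       # most relevant document first
```
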
832
 
833
 
 
878
  if optimize_vocab:
879
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
880
  chunks = optimized_chunks
881
+
882
  search_query = query
883
+
884
  if use_query_optimization:
885
  optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
886
  #query = " ".join(optimized_queries)
887
+ search_query = optimized_queries # " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
888
 
889
  results, search_time, vector_store, results_raw = search_embeddings(
890
  chunks,
 
897
  lang,
898
  apply_phonetic,
899
  phonetic_weight
900
+ )
901
+
902
  if use_reranking:
903
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
904
  results_raw = rerank_results(results_raw, query, reranker)
 
953
  def automated_testing(file, query, test_params, expected_result=None):
954
  all_results = []
955
  all_stats = []
956
+
957
  param_grid = ParameterGrid(test_params)
958
  print(param_grid)
959
  for params in tqdm(param_grid, desc="Running tests"):
 
995
  params['apply_phonetic'],
996
  params['phonetic_weight']
997
  )
998
+
999
  if params['use_reranking']:
1000
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1001
  results_raw = rerank_results(results_raw, query, reranker)
 
1022
  'contains_expected': 0.5, # High weight for containing the expected result
1023
  'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
1024
  }
1025
+ if stats_df.empty:
1026
+ print("stats_df is empty. Cannot compute best configuration.")
1027
+ return None
1028
+
1029
  for metric in metric_weights.keys():
1030
+
1031
+ if metric in stats_df.columns:
1032
+ stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
1033
+ else:
1034
+ stats_df[metric] = 0
1035
+ print("Column 'search_time' is missing in stats_df.")
1036
+
1037
+
1038
+
1039
  stats_df['weighted_score'] = sum(
1040
+ stats_df[metric].fillna(0) * weight
1041
  for metric, weight in metric_weights.items()
1042
  )
1043
+
1044
  best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
1045
+
1046
  recommendations = {
1047
  'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
1048
  'best_settings': {
 
1069
  'expected_result_rank': int(best_config['expected_result_rank'])
1070
  }
1071
  }
1072
+
1073
  return recommendations
1074
 
1075
  ####
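
To make the weighted ranking in `analyze_results` concrete, here is a small hypothetical example. Only the `contains_expected` and `expected_result_rank` weights are visible in this hunk; the remaining weights below are assumed for illustration:

```python
import pandas as pd

# Hypothetical stats for two tested configurations
stats_df = pd.DataFrame({
    "search_time":          [0.20, 0.05],
    "result_diversity":     [0.60, 0.40],
    "rank_correlation":     [0.70, 0.50],
    "contains_expected":    [1, 0],
    "expected_result_rank": [1, 3],
})
metric_weights = {
    "search_time": -0.3, "result_diversity": 0.2, "rank_correlation": 0.3,  # assumed
    "contains_expected": 0.5, "expected_result_rank": -0.4,                 # as in the hunk
}
stats_df["weighted_score"] = sum(stats_df[m].fillna(0) * w for m, w in metric_weights.items())
print(stats_df["weighted_score"])  # the first row wins: it contains the expected result
```
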
 
1079
  return {"error": "No file uploaded"}
1080
 
1081
  chunks, _, _ = process_files(
1082
+ file.name,
1083
+ 'HuggingFace',
1084
+ 'paraphrase-miniLM',
1085
+ 'recursive',
1086
+ 250,
1087
  50,
1088
  custom_separators=None
1089
  )
1090
+
1091
  # Select a few random chunks
1092
  sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
1093
+
1094
+
1095
+ llm_pipeline = pipeline(model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')
1096
+
1097
+
1098
+ prompt=f'''
1099
+ <|start_header_id|>system<|end_header_id|>
1100
+ You are an expert in information retrieval.
1101
+ You know about the strengths and weaknesses of all models.
1102
+
1103
+ Given the following text chunks from a document,
1104
+ suggest optimal settings for an embedding-based search system. The settings should include:
1105
+
1106
+ 1. Embedding model type and name
1107
+ 2. Split strategy (token or recursive)
1108
+ 3. Chunk size
1109
+ 4. Overlap size
1110
+ 5. Vector store type (FAISS or Chroma)
1111
+ 6. Search type (similarity, mmr, or custom)
1112
+ 7. Top K results to retrieve
1113
+ 8. Whether to apply preprocessing
1114
+ 9. Whether to optimize vocabulary
1115
+ 10. Whether to apply phonetic matching
1116
+
1117
+ Expected output format:
1118
+ {{
1119
+ "embedding_models": "embedding_model_type:embedding_model_name",
1120
+ "split_strategy": "token or recursive",
1121
+ "chunk_size": 250,
1122
+ "overlap_size": 50,
1123
+ "vector_store_type": "FAISS or Chroma",
1124
+ "search_type": "similarity, mmr, or custom",
1125
+ "top_k": 5,
1126
+ "apply_preprocessing": True,
1127
+ "optimize_vocab": True,
1128
+ "apply_phonetic": False,
1129
+ "phonetic_weight": 0.3 #
1130
+ }}
1131
+
1132
+ Provide your suggestions in a Python dictionary format.
1133
+
1134
+ Show me the settings. You SHOULD NOT include any other text in the response.
1135
+ Fill out the settings and choose useful values.
1136
+ Respect the user's use case and content snippet. Choose the settings based on the chunks.
1137
+
1138
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
1139
+ User use case:
1140
+ {"small local", "large total context", ...}
1141
+
1142
+ Total content length:
1143
+ {len(' '.join(chunks))}
1144
+
1145
+ Content snippet:
1146
+ {' '.join(sample_chunks)}
1147
+ <|eot_id|><|start_header_id|>assistant<|end_header_id|>
1148
+ '''
1149
+ suggested_settings = llm_pipeline(
1150
+ prompt,
1151
+ do_sample=True,
1152
+ top_k=10,
1153
+ num_return_sequences=1,
1154
+ return_full_text=False,
1155
+ max_new_tokens=1900, # Control the length of the output,
1156
+ truncation=True, # Enable truncation
1157
+ )
1158
+
1159
+
1160
+ #suggested_settings = llm.invoke(prompt)
1161
  print("setting suggested")
1162
  print(suggested_settings)
1163
  # Parse the generated text to extract the dictionary
 
1183
  def update_inputs_with_llm_suggestions(suggestions):
1184
  if suggestions is None or "error" in suggestions:
1185
  return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
1186
+
1187
  return [
1188
  gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
1189
  gr.update(value=suggestions["split_strategy"]), # split_strategy_input
 
1201
  def parse_model_selections(default_models, custom_models):
1202
  """
1203
  Parse selected default models and custom models into model configurations
1204
+
1205
  Args:
1206
  default_models (List[str]): Selected default models in format "type:name"
1207
  custom_models (str): Custom models string with one model per line in format "type:name"
1208
+
1209
  Returns:
1210
  List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
1211
  """
1212
  model_configs = []
1213
+
1214
  # Process default models
1215
  if default_models:
1216
  for model in default_models:
 
1219
  'type': model_type,
1220
  'name': model_name
1221
  })
1222
+
1223
  # Process custom models
1224
  if custom_models:
1225
  custom_model_lines = custom_models.strip().split('\n')
 
1230
  'type': model_type.strip(),
1231
  'name': model_name.strip()
1232
  })
1233
+
1234
  return model_configs
1235
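  # Illustrative usage (the custom model name below is a placeholder, not a default of this tool):
  # parse_model_selections(["HuggingFace:paraphrase-miniLM"],
  #                        "HuggingFace:sentence-transformers/all-MiniLM-L6-v2")
  # -> [{'type': 'HuggingFace', 'name': 'paraphrase-miniLM'},
  #     {'type': 'HuggingFace', 'name': 'sentence-transformers/all-MiniLM-L6-v2'}]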
 
1236
  def parse_comma_separated(text):
 
1240
  return [x.strip() for x in text.split(',') if x.strip()]
1241
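  # Example: parse_comma_separated("100, 250 , 500") -> ["100", "250", "500"]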
 
1242
 
1243
+
1244
  # Gradio Interface
1245
  def launch_interface(debug=True):
1246
  with gr.Blocks() as iface:
1247
  gr.Markdown("# Advanced Embedding Comparison Tool")
1248
+
1249
  with gr.Tab("Simple"):
1250
  file_input = gr.File(label="Upload File (Optional)")
1251
  query_input = gr.Textbox(label="Search Query")
 
1260
  label="Embedding Models"
1261
  )
1262
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
1263
+
1264
  with gr.Tab("Advanced"):
1265
  custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
1266
  split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
 
1270
  vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
1271
  search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
1272
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
1273
+
1274
  with gr.Tab("Expert"):
1275
  apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
1276
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
 
1288
  with gr.Row():
1289
  auto_file_input = gr.File(label="Upload File (Optional)")
1290
  auto_query_input = gr.Textbox(label="Search Query")
1291
+
1292
  with gr.Row():
1293
  auto_expected_result_input = gr.Textbox(
1294
  label="Expected Result (Optional)",
 
1298
  label="Model Feedback (Optional)",
1299
  placeholder="Enter any feedback about model performance"
1300
  )
1301
+
1302
  with gr.Row():
1303
  with gr.Column():
1304
  # Default model selection
1305
  default_models_input = gr.CheckboxGroup(
1306
+ choices=[f"{type}:{name}"
1307
+ for type, names in DEFAULT_MODELS.items()
1308
  for name in names],
1309
  label="Default Models",
1310
  value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
1311
  )
1312
+
1313
  with gr.Column():
1314
  # Custom model input
1315
  custom_models_input = gr.TextArea(
 
1317
  placeholder="Enter one model per line in format: type:name",
1318
  lines=3
1319
  )
1320
+
1321
  auto_split_strategies = gr.CheckboxGroup(
1322
  choices=["token", "recursive"],
1323
  label="Split Strategies to Test"
 
1336
  auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
1337
  auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
1338
  auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
1339
+
1340
  auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
1341
  auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
1342
  recommendations_output = gr.JSON(label="Recommendations")
1343
+
1344
  def run_automation(file_input, query_input, expected_result, default_models, custom_models,
1345
  split_strategies, chunk_sizes, overlap_sizes,
1346
  vector_store_types, search_types, top_k_values,
1347
  optimize_vocab, use_query_optimization, use_reranking,
1348
  model_feedback):
1349
  """Wrapper function to handle Gradio inputs and run automated tests"""
1350
+
1351
  # Parse model configurations
1352
  model_configs = parse_model_selections(default_models, custom_models)
1353
+
1354
  # Parse test parameters
1355
  test_params = {
1356
  'split_strategy': split_strategies,
 
1369
  'custom_separators': [None],
1370
  'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
1371
  }
1372
+
1373
  # Run automated tests
1374
  results_df, stats_df = run_automated_tests(
1375
  file_input.name if file_input else None,
 
1379
  expected_result if expected_result else None,
1380
  model_feedback if model_feedback else None
1381
  )
1382
+
1383
  # Generate recommendations based on results
1384
  recommendations = analyze_results(stats_df)
1385
+
1386
  return results_df, stats_df, recommendations
1387
+
1388
  auto_submit_button = gr.Button("Run Automated Tests")
1389
  auto_submit_button.click(
1390
  fn=run_automation,
 
1399
  outputs=[auto_results_output, auto_stats_output, recommendations_output]
1400
  )
1401
  ###
1402
+
1403
  with gr.Tab("Results"):
1404
  with gr.Row():
1405
  results_output = gr.DataFrame(label="Results")
1406
  stats_output = gr.DataFrame(label="Statistics")
1407
+
1408
  with gr.Row():
1409
  plot_output = gr.Plot(label="Visualizations")
1410
  model_rankings_output = gr.JSON(label="Model Rankings")
1411
+
1412
  with gr.Row():
1413
  recommendations_output = gr.JSON(label="Recommendations")
1414
+
1415
  with gr.Tab("LLM Suggestions"):
1416
  llm_file_input = gr.File(label="Upload File for LLM Suggestions")
1417
  llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
1418
  llm_suggest_button = gr.Button("Get LLM Suggestions")
1419
  llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
1420
+
1421
  llm_suggest_button.click(
1422
  fn=get_llm_suggested_settings,
1423
  inputs=[llm_file_input, llm_num_chunks],
 
1426
  fn=update_inputs_with_llm_suggestions,
1427
  inputs=[llm_suggestions_output],
1428
  outputs=[
1429
+ embedding_models_input, split_strategy_input, chunk_size_input,
1430
+ overlap_size_input, vector_store_type_input, search_type_input,
1431
+ top_k_input, apply_preprocessing_input, optimize_vocab_input,
1432
  apply_phonetic_input, phonetic_weight_input
1433
  ]
1434
  )
 
1549
  User: "Was sind die Hauptziele des KI-Gesetzes?"
1550
  """
1551
 
1552
+
1553
  tutorial_md = """
1554
  # Advanced Embedding Comparison Tool Tutorial
1555
 
 
1698
  def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
1699
  with open(file_path, 'r', encoding='utf-8') as f:
1700
  text = f.read()
1701
+
1702
  tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
1703
  tokenizer.pre_tokenizer = Whitespace()
1704
+
1705
  special = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
  trainer = trainers.WordLevelTrainer(special_tokens=special, vocab_size=vocab_size) if model_type == 'WordLevel' else trainers.BpeTrainer(special_tokens=special, vocab_size=vocab_size)
1706
  tokenizer.train_from_iterator([text], trainer)
1707
+
1708
  return tokenizer
1709
  ```
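  A minimal usage sketch (the file path and vocabulary size are illustrative, not defaults of this tool):

  ```python
  # Train a word-level tokenizer on a local corpus, then tokenize a sample sentence.
  tokenizer = create_custom_tokenizer("my_corpus.txt", model_type="WordLevel", vocab_size=5000)
  print(tokenizer.get_vocab_size())
  print(tokenizer.encode("Embedding comparison made easy").tokens)
  ```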
1710
 
 
1736
 
1737
 
1738
  ## Useful Resources and Links
1739
+
1740
  Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
1741
+
1742
  ### Embeddings and Vector Databases
1743
  - [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
1744
  - [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
1745
  - [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
1746
+
1747
  ### Natural Language Processing
1748
  - [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
1749
  - [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
1750
  - [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
1751
+
1752
  ### Retrieval-Augmented Generation (RAG)
1753
  - [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
1754
  - [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
1755
+
1756
  ### German Language Processing
1757
  - [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
1758
  - [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
1759
+
1760
  ### Benchmarks and Evaluation
1761
  - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
1762
  - [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
1763
+
1764
  ### Tools and Libraries
1765
  - [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
1766
  - [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
1767
+
1768
  ### Support me
1769
  - [Visual Crew Builder](https://visual-crew.builder.ai/): A tool for creating AI systems, workflows, and APIs. Or just a notebook.
1770
+
1771
+
1772
 
1773
  This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
1774
 
 
1791
  settings['lang'],
1792
  settings['apply_preprocessing']
1793
  )
1794
+
1795
  results, _, _, _ = search_embeddings(
1796
  chunks,
1797
  embedding_model,
 
1803
  apply_phonetic=settings['apply_phonetic'],
1804
  phonetic_weight=settings['phonetic_weight']
1805
  )
1806
+
1807
  # Generate a response based on the retrieved results
1808
  response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
1809
  for i, result in enumerate(results[:settings['top_k']]):
1810
  response += f"{i+1}. {result['content'][:100]}...\n\n"
1811
+
1812
  return response
1813
 
1814
  with gr.Blocks() as chat_interface:
 
1846
  launch_interface()
1847
  # Uncomment the following line to launch the sample chat app
1848
  ```
1849
+
1850
  """
1851
 
1852
 
 
1855
  ["Embedding Comparison", "Tutorial", "Use Case"]
1856
  )
1857
 
1858
+ iface.launch(debug=True, share=True)
1859
 
1860
  # Enhanced Automated Testing
1861
+ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
1862
  test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
1863
  model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
1864
  """
 
1867
  all_results = []
1868
  all_stats = []
1869
  model_manager = ModelManager()
1870
+
1871
  # Create parameter grid excluding model configurations
1872
  base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
1873
  param_grid = ParameterGrid(base_params)
1874
+
1875
  # Test each model configuration with all parameter combinations
1876
  for model_config in tqdm(model_configs, desc="Testing models"):
1877
  model_type = model_config['type']
1878
  model_name = model_config['name']
1879
+
1880
  for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
1881
  try:
1882
  # Process files and get chunks
 
1891
  params['lang'],
1892
  params['apply_preprocessing']
1893
  )
1894
+
1895
  # Apply vocabulary optimization if specified
1896
  if params['optimize_vocab']:
1897
  tokenizer, chunks = optimize_vocabulary(chunks)
1898
+
1899
  # Apply query optimization if specified
1900
  current_query = query
1901
  if params['use_query_optimization']:
 
1909
  params['top_k']
1910
  )
1911
  current_query = " ".join(optimized_queries)
1912
+
1913
  # Perform search
1914
  results, search_time, vector_store, raw_results = search_embeddings(
1915
  chunks,
 
1923
  params['apply_phonetic'],
1924
  params['phonetic_weight']
1925
  )
1926
+
1927
  # Apply reranking if specified
1928
  if params['use_reranking']:
1929
+ reranker = pipeline("text-classification",
1930
  model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1931
  raw_results = rerank_results(raw_results, current_query, reranker)
1932
+
1933
  # Calculate statistics
1934
  stats = ResultAnalyzer.calculate_statistics(
1935
  raw_results, search_time, vector_store, num_tokens,
1936
  embedding_model, current_query, params['top_k'],
1937
  expected_result, model_feedback
1938
  )
1939
+
1940
  # Update model rankings
1941
  model_id = f"{model_type}:{model_name}"
1942
  ranking_score = calculate_model_ranking_score(stats)
1943
  model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
1944
+
1945
  # Add model information to stats
1946
  stats.update({
1947
  "model_type": model_type,
 
1949
  "model": f"{model_type} - {model_name}",
1950
  **params
1951
  })
1952
+
1953
  # Format and store results
1954
  all_results.extend(format_results(raw_results, stats))
1955
  all_stats.append(stats)
1956
+
1957
  except Exception as e:
1958
  print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
1959
  continue
1960
+
1961
  return pd.DataFrame(all_results), pd.DataFrame(all_stats)
1962
 
1963
  # Helper function to calculate model ranking score
 
1970
  'contains_expected': 0.3,
1971
  'expected_result_rank': -0.2 # Negative weight because lower rank is better
1972
  }
1973
+
1974
  score = 0.0
1975
  for metric, weight in weights.items():
1976
  if metric in stats and not isinstance(stats[metric], str):
 
1981
  else:
1982
  value = float(stats[metric])
1983
  score += weight * value
1984
+
1985
  return score
1986
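  # Worked example (hypothetical values): contains_expected = 1 at
  # expected_result_rank = 2 adds 0.3 * 1 + (-0.2) * 2 = -0.1 to the score;
  # the remaining metrics contribute their weighted values on top.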
 
1987
  if __name__ == "__main__":
1988
  launch_interface()