Chris4K committed on
Commit
aa72e55
1 Parent(s): 97c3e76

Update app.py

Files changed (1)
  1. app.py +246 -224
app.py CHANGED
@@ -41,8 +41,8 @@ from huggingface_hub import login
41
  from typing import List, Tuple, Optional
42
 
43
 
44
- hf_token = os.getenv("hf_token")
45
- login(token=hf_token)
46
 
47
  # Define the model pipeline with additional generation parameters
48
  #model_pipeline = pipeline(
@@ -154,28 +154,28 @@ class ModelManager:
154
  }
155
  }
156
 
157
-
158
  def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
159
  """Update model ranking based on performance and optional feedback"""
160
  current_score = self.rankings.get(model_id, 0.0)
161
  # Weighted average of current score and new score
162
  self.rankings[model_id] = 0.7 * current_score + 0.3 * score
163
-
164
  if feedback:
165
  if model_id not in self.model_stats:
166
  self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
167
  self.model_stats[model_id]["feedback_count"] += 1
168
  self.model_stats[model_id]["feedback"].append(feedback)
169
-
170
  def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
171
  """Get top n ranked models"""
172
  return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
173
-
174
  def get_model_stats(self, model_id: str) -> Dict[str, Any]:
175
  """Get statistics for a specific model"""
176
  return self.model_stats.get(model_id, {})
177
 
178
-
179
  def add_model(self, provider, name, model_path):
180
  if provider not in self.models:
181
  self.models[provider] = {}
@@ -286,29 +286,29 @@ def simple_tokenize(text):
286
  def preprocess_text(text, lang='german', apply_preprocessing=False):
287
  if not apply_preprocessing:
288
  return text
289
-
290
  text = text.lower()
291
  text = re.sub(r'[^a-zA-Z\s]', '', text)
292
-
293
  try:
294
  tokens = word_tokenize(text, language=lang)
295
  except LookupError:
296
  print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
297
  tokens = simple_tokenize(text)
298
-
299
  try:
300
  stop_words = set(stopwords.words(lang))
301
  except LookupError:
302
  print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
303
  stop_words = set()
304
  tokens = [token for token in tokens if token not in stop_words]
305
-
306
  try:
307
  stemmer = SnowballStemmer(lang)
308
  tokens = [stemmer.stem(token) for token in tokens]
309
  except ValueError:
310
  print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
311
-
312
  return ' '.join(tokens)
313
 
314
  def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
@@ -341,7 +341,7 @@ def optimize_query(
341
  ) -> str:
342
  """
343
  CPU-optimized version of query expansion using a small language model.
344
-
345
  Args:
346
  query: Original search query
347
  query_optimization_model: Name or path of the model to use for optimization
@@ -351,17 +351,17 @@ def optimize_query(
351
  search_type: Type of search being performed
352
  top_k: Number of expansion terms to add
353
  use_gpu: Whether to use GPU if available (defaults to False for CPU)
354
-
355
  Returns:
356
  Expanded query string
357
  """
358
  try:
359
  # Set device
360
  device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
361
-
362
  # 1. Basic text preprocessing (CPU-based)
363
  tokens = word_tokenize(query.lower())
364
-
365
  # 2. WordNet synonyms expansion (CPU-based)
366
  expanded_terms = set()
367
  for token in tokens:
@@ -370,7 +370,7 @@ def optimize_query(
370
  for syn in synsets:
371
  # Limit number of lemmas
372
  expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
373
-
374
  # 3. Use provided model with reduced complexity
375
  try:
376
  # Load model with reduced memory footprint
@@ -384,11 +384,11 @@ def optimize_query(
384
  low_cpu_mem_usage=True,
385
  device_map="cpu"
386
  )
387
-
388
  # Move model to CPU and eval mode
389
  model = model.to(device)
390
  model.eval()
391
-
392
  # Prepare input with reduced length
393
  prompt = f"Enhance this search query with relevant terms: {query}"
394
  inputs = tokenizer(
@@ -398,7 +398,7 @@ def optimize_query(
398
  truncation=True,
399
  padding=True
400
  )
401
-
402
  # Generate with minimal parameters
403
  with torch.no_grad():
404
  outputs = model.generate(
@@ -409,41 +409,41 @@ def optimize_query(
409
  do_sample=False,
410
  early_stopping=True
411
  )
412
-
413
  enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
414
-
415
  # Clear CUDA cache if GPU was used
416
  if device == "cuda":
417
  torch.cuda.empty_cache()
418
-
419
  except Exception as model_error:
420
  print(f"Model-based expansion failed: {str(model_error)}")
421
  enhanced_query = query
422
-
423
  # 4. Combine original and expanded terms
424
  final_terms = set(tokens)
425
  final_terms.update(expanded_terms)
426
  if enhanced_query != query:
427
  final_terms.update(word_tokenize(enhanced_query.lower()))
428
-
429
  # 5. Remove stopwords and select top_k most relevant terms
430
  stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
431
  final_terms = [term for term in final_terms if term not in stopwords]
432
-
433
  # Combine with original query
434
  expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
435
-
436
  # Clean up
437
  del model
438
  del tokenizer
439
  if device == "cuda":
440
  torch.cuda.empty_cache()
441
-
442
- return [Document(page_content=expanded_query.strip())]
443
-
444
  except Exception as e:
445
  print(f"Query optimization failed: {str(e)}")
446
- return [Document(page_content=query)] # Return original query if optimization fails
447
 
448
 
449
 
@@ -458,27 +458,27 @@ optimized_query = optimize_query(
458
  use_gpu=False # Explicitly use CPU
459
  )
460
  """
461
-
462
 
463
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
464
  tokenized_texts = [text.split() for text in texts]
465
-
466
  if model_type == 'word2vec':
467
  model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
468
  elif model_type == 'fasttext':
469
  model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
470
  else:
471
  raise ValueError("Unsupported model type")
472
-
473
  return model
474
 
475
  class CustomEmbeddings(HuggingFaceEmbeddings):
476
  def __init__(self, model_path):
477
  self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
478
-
479
  def embed_documents(self, texts):
480
  return [self.model.wv[text.split()] for text in texts]
481
-
482
  def embed_query(self, text):
483
  return self.model.wv[text.split()]
484
 
@@ -520,7 +520,7 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
520
  chunk_size=chunk_size,
521
  chunk_overlap=overlap_size,
522
  add_start_index=True, # If `True`, includes chunk's start index in metadata
523
- strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
524
  separators=custom_separators or ["\n\n", "\n", " ", ""]
525
  )
526
  else:
@@ -534,7 +534,7 @@ def get_embedding_model(model_type, model_name):
534
  multi_process=True,
535
  # model_kwargs={"device": "cpu"},
536
  #encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
537
- )
538
  elif model_type == 'OpenAI':
539
  return OpenAIEmbeddings(model=model_path)
540
  elif model_type == 'Cohere':
@@ -566,10 +566,10 @@ def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_
566
  phonetic_sim = phonetic_match(doc_text, query)
567
  combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
568
  return combined_sim
569
-
570
  def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
571
  chunks = list(chunks_tuple)
572
-
573
  if vector_store_type == 'FAISS':
574
  return FAISS.from_texts(chunks, embedding_model)
575
  elif vector_store_type == 'Chroma':
@@ -587,7 +587,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
587
  for file in os.listdir(FILES_DIR):
588
  file_path = os.path.join(FILES_DIR, file)
589
  text += FileHandler.extract_text(file_path)
590
-
591
  if custom_tokenizer_file:
592
  tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
593
  text = ' '.join(custom_tokenize(text, tokenizer))
@@ -603,7 +603,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
603
 
604
  def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
605
  preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
606
-
607
  vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
608
  retriever = get_retriever(vector_store, search_type, {"k": top_k})
609
 
@@ -613,10 +613,10 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
613
  #this should be optional
614
  def score_result(doc):
615
  base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
616
-
617
  # Add bonus for containing expected result
618
  expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
619
-
620
  if apply_phonetic:
621
  phonetic_score = phonetic_match(doc.page_content, query)
622
  return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
@@ -645,7 +645,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
645
  # Enhanced Result Analysis
646
  class ResultAnalyzer:
647
  @staticmethod
648
- def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
649
  top_k, expected_result=None, model_feedback=None):
650
  stats = {
651
  "num_results": len(results),
@@ -657,7 +657,7 @@ class ResultAnalyzer:
657
  "embedding_dimension": len(embedding_model.embed_query(query)),
658
  "top_k": top_k,
659
  }
660
-
661
  # Add vector store statistics
662
  try:
663
  if hasattr(vector_store, '_index'):
@@ -666,13 +666,13 @@ class ResultAnalyzer:
666
  stats["vector_store_size"] = len(vector_store._collection.get())
667
  except:
668
  stats["vector_store_size"] = "N/A"
669
-
670
  # Add expected result statistics if provided
671
  if expected_result:
672
  stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
673
- stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
674
  if expected_result in doc.page_content), -1) + 1
675
-
676
  # Calculate diversity metrics for larger result sets
677
  if len(results) > 3: # Changed from 1000 to make it more practical
678
  embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
@@ -681,7 +681,7 @@ class ResultAnalyzer:
681
  else:
682
  stats["result_diversity"] = "N/A"
683
  stats["silhouette_score"] = "N/A"
684
-
685
  # Add ranking correlation
686
  query_embedding = embedding_model.embed_query(query)
687
  result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
@@ -691,20 +691,20 @@ class ResultAnalyzer:
691
  stats["rank_correlation"] = rank_correlation
692
  else:
693
  stats["rank_correlation"] = "N/A"
694
-
695
  # Add model feedback if provided
696
  if model_feedback:
697
  stats["model_feedback"] = model_feedback
698
-
699
  return stats
700
-
701
  @staticmethod
702
  def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
703
  """Calculate diversity score for embeddings"""
704
  embeddings_array = np.array(embeddings)
705
  pairwise_similarities = np.inner(embeddings_array, embeddings_array)
706
  return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
707
-
708
  @staticmethod
709
  def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
710
  """Calculate silhouette score for embeddings"""
@@ -724,13 +724,13 @@ def visualize_results(results_df, stats_df):
724
  # Add model column if not present
725
  if 'model' not in stats_df.columns:
726
  stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
727
-
728
  fig, axs = plt.subplots(2, 2, figsize=(20, 20))
729
-
730
  # Handle empty dataframe case
731
  if len(stats_df) == 0:
732
  return fig
733
-
734
  # Create plots with error handling
735
  try:
736
  sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
@@ -738,36 +738,36 @@ def visualize_results(results_df, stats_df):
738
  axs[0, 0].tick_params(axis='x', rotation=45)
739
  except Exception as e:
740
  print(f"Error in search time plot: {e}")
741
-
742
  try:
743
- sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
744
  hue='model', ax=axs[0, 1])
745
  axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
746
  except Exception as e:
747
  print(f"Error in diversity plot: {e}")
748
-
749
  try:
750
  sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
751
  axs[1, 0].set_title('Distribution of Result Content Lengths')
752
  axs[1, 0].tick_params(axis='x', rotation=45)
753
  except Exception as e:
754
  print(f"Error in content length plot: {e}")
755
-
756
  try:
757
  valid_embeddings = results_df['embedding'].dropna().values
758
  if len(valid_embeddings) > 1:
759
  tsne = TSNE(n_components=2, random_state=42)
760
  embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
761
- sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
762
- hue=results_df['Model'][:len(valid_embeddings)],
763
  ax=axs[1, 1])
764
  axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
765
  else:
766
- axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
767
  ha='center', va='center')
768
  except Exception as e:
769
  print(f"Error in embedding visualization: {e}")
770
-
771
  plt.tight_layout()
772
  return fig
773
 
@@ -778,56 +778,56 @@ def visualize_results(results_df, stats_df):
778
  #plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
779
  #plt.show()
780
 
781
-
782
  def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
783
  tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
784
 
785
  word_freq = Counter(word for text in texts for word in text.split())
786
-
787
  optimized_texts = [
788
  ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
789
  for text in texts
790
  ]
791
-
792
  trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
793
  tokenizer.train_from_iterator(optimized_texts, trainer)
794
-
795
  return tokenizer, optimized_texts
796
-
797
  import numpy as np
798
  from transformers import TextClassificationPipeline
799
  from typing import List, Union, Any
800
 
801
-
802
 
803
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
804
 
805
 
806
  def rerank_results(
807
- results: List[Any],
808
- query: str,
809
  reranker: Union[TextClassificationPipeline, Any]
810
  ) -> List[Any]:
811
  """
812
-
813
  """
814
  if not results:
815
  return results
816
-
817
  # Step 1: Encode the query and documents using SentenceTransformer
818
  query_embedding = model.encode(query, convert_to_tensor=True)
819
  doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
820
  doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
821
-
822
  # Step 2: Compute cosine similarities between query and document embeddings
823
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
824
-
825
  # Step 3: Sort documents by similarity score in descending order
826
- reranked_idx = np.argsort(cosine_scores.numpy())[::-1]
827
-
828
  # Step 4: Return the reranked documents
829
  reranked_results = [results[i] for i in reranked_idx]
830
-
831
  return reranked_results
832
 
833
 
@@ -878,13 +878,13 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
878
  if optimize_vocab:
879
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
880
  chunks = optimized_chunks
881
-
882
  search_query = query
883
-
884
  if use_query_optimization:
885
  optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
886
  #query = " ".join(optimized_queries)
887
- search_query = " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
888
 
889
  results, search_time, vector_store, results_raw = search_embeddings(
890
  chunks,
@@ -897,8 +897,8 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
897
  lang,
898
  apply_phonetic,
899
  phonetic_weight
900
- )
901
-
902
  if use_reranking:
903
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
904
  results_raw = rerank_results(results_raw, query, reranker)
@@ -953,7 +953,7 @@ from tqdm import tqdm
953
  def automated_testing(file, query, test_params, expected_result=None):
954
  all_results = []
955
  all_stats = []
956
-
957
  param_grid = ParameterGrid(test_params)
958
  print(param_grid)
959
  for params in tqdm(param_grid, desc="Running tests"):
@@ -995,7 +995,7 @@ def automated_testing(file, query, test_params, expected_result=None):
995
  params['apply_phonetic'],
996
  params['phonetic_weight']
997
  )
998
-
999
  if params['use_reranking']:
1000
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1001
  results_raw = rerank_results(results_raw, query, reranker)
@@ -1022,17 +1022,27 @@ def analyze_results(stats_df):
1022
  'contains_expected': 0.5, # High weight for containing the expected result
1023
  'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
1024
  }
1025
-
 
 
 
1026
  for metric in metric_weights.keys():
1027
- stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
1028
-
 
 
 
 
 
 
 
1029
  stats_df['weighted_score'] = sum(
1030
- stats_df[metric].fillna(0) * weight
1031
  for metric, weight in metric_weights.items()
1032
  )
1033
-
1034
  best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
1035
-
1036
  recommendations = {
1037
  'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
1038
  'best_settings': {
@@ -1059,7 +1069,7 @@ def analyze_results(stats_df):
1059
  'expected_result_rank': int(best_config['expected_result_rank'])
1060
  }
1061
  }
1062
-
1063
  return recommendations
1064
 
1065
  ####
@@ -1069,72 +1079,85 @@ def get_llm_suggested_settings(file, num_chunks=1):
1069
  return {"error": "No file uploaded"}
1070
 
1071
  chunks, _, _ = process_files(
1072
- file.name,
1073
- 'HuggingFace',
1074
- 'paraphrase-miniLM',
1075
- 'recursive',
1076
- 250,
1077
  50,
1078
  custom_separators=None
1079
  )
1080
-
1081
  # Select a few random chunks
1082
  sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
1083
-
1084
- # Prepare the prompt
1085
- prompt = f"""Given the following text chunks from a document, suggest optimal settings for an embedding-based search system. The settings should include:
1086
-
1087
- 1. Embedding model type and name
1088
- 2. Split strategy (token or recursive)
1089
- 3. Chunk size
1090
- 4. Overlap size
1091
- 5. Vector store type (FAISS or Chroma)
1092
- 6. Search type (similarity, mmr, or custom)
1093
- 7. Top K results to retrieve
1094
- 8. Whether to apply preprocessing
1095
- 9. Whether to optimize vocabulary
1096
- 10. Whether to apply phonetic matching
1097
-
1098
- Expected output format:
1099
- {{
1100
- "embedding_models": "embedding_model_type:embedding_model_name",
1101
- "split_strategy": "token or recursive",
1102
- "chunk_size": 250,
1103
- "overlap_size": 50,
1104
- "vector_store_type": "FAISS or Chroma",
1105
- "search_type": "similarity, mmr, or custom",
1106
- "top_k": 5,
1107
- "apply_preprocessing": True,
1108
- "optimize_vocab": True,
1109
- "apply_phonetic": False,
1110
- "phonetic_weight": 0.3 # Default value, as it's not in the LLM suggestions
1111
- }}
1112
-
1113
- Text chunks:
1114
- {' '.join(sample_chunks)}
1115
-
1116
- Provide your suggestions in a Python dictionary format."""
1117
-
1118
- # Use a HuggingFace model for text generation
1119
- #model_id = "google/flan-t5-large"
1120
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
1121
- #model = AutoModelForCausalLM.from_pretrained(model_id)
1122
- #pipe = pipeline(
1123
- # "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512
1124
- #)
1125
- #llm = HuggingFacePipeline(pipeline=pipe)
1126
-
1127
- #llm = HuggingFacePipeline(pipeline(model="HuggingFaceH4/zephyr-7b-beta"))
1128
-
1129
-
1130
- #llm = HuggingFacePipeline.from_model_id(
1131
- # model_id="google/flan-t5-large",
1132
- # task="text2text-generation",
1133
- # model_kwargs={"do_sample": True, "temperature": 0.7, "max_new_tokens": 512},
1134
- #)
1135
-
1136
- # Generate suggestions
1137
- suggested_settings = llm.invoke(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
  print("setting suggested")
1139
  print(suggested_settings)
1140
  # Parse the generated text to extract the dictionary
@@ -1160,7 +1183,7 @@ Provide your suggestions in a Python dictionary format."""
1160
  def update_inputs_with_llm_suggestions(suggestions):
1161
  if suggestions is None or "error" in suggestions:
1162
  return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
1163
-
1164
  return [
1165
  gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
1166
  gr.update(value=suggestions["split_strategy"]), # split_strategy_input
@@ -1178,16 +1201,16 @@ def update_inputs_with_llm_suggestions(suggestions):
1178
  def parse_model_selections(default_models, custom_models):
1179
  """
1180
  Parse selected default models and custom models into model configurations
1181
-
1182
  Args:
1183
  default_models (List[str]): Selected default models in format "type:name"
1184
  custom_models (str): Custom models string with one model per line in format "type:name"
1185
-
1186
  Returns:
1187
  List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
1188
  """
1189
  model_configs = []
1190
-
1191
  # Process default models
1192
  if default_models:
1193
  for model in default_models:
@@ -1196,7 +1219,7 @@ def parse_model_selections(default_models, custom_models):
1196
  'type': model_type,
1197
  'name': model_name
1198
  })
1199
-
1200
  # Process custom models
1201
  if custom_models:
1202
  custom_model_lines = custom_models.strip().split('\n')
@@ -1207,7 +1230,7 @@ def parse_model_selections(default_models, custom_models):
1207
  'type': model_type.strip(),
1208
  'name': model_name.strip()
1209
  })
1210
-
1211
  return model_configs
1212
 
1213
  def parse_comma_separated(text):
@@ -1217,12 +1240,12 @@ def parse_comma_separated(text):
1217
  return [x.strip() for x in text.split(',') if x.strip()]
1218
 
1219
 
1220
-
1221
  # Gradio Interface
1222
  def launch_interface(debug=True):
1223
  with gr.Blocks() as iface:
1224
  gr.Markdown("# Advanced Embedding Comparison Tool")
1225
-
1226
  with gr.Tab("Simple"):
1227
  file_input = gr.File(label="Upload File (Optional)")
1228
  query_input = gr.Textbox(label="Search Query")
@@ -1237,7 +1260,7 @@ def launch_interface(debug=True):
1237
  label="Embedding Models"
1238
  )
1239
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
1240
-
1241
  with gr.Tab("Advanced"):
1242
  custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
1243
  split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -1247,7 +1270,7 @@ def launch_interface(debug=True):
1247
  vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
1248
  search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
1249
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
1250
-
1251
  with gr.Tab("Expert"):
1252
  apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
1253
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
@@ -1265,7 +1288,7 @@ def launch_interface(debug=True):
1265
  with gr.Row():
1266
  auto_file_input = gr.File(label="Upload File (Optional)")
1267
  auto_query_input = gr.Textbox(label="Search Query")
1268
-
1269
  with gr.Row():
1270
  auto_expected_result_input = gr.Textbox(
1271
  label="Expected Result (Optional)",
@@ -1275,18 +1298,18 @@ def launch_interface(debug=True):
1275
  label="Model Feedback (Optional)",
1276
  placeholder="Enter any feedback about model performance"
1277
  )
1278
-
1279
  with gr.Row():
1280
  with gr.Column():
1281
  # Default model selection
1282
  default_models_input = gr.CheckboxGroup(
1283
- choices=[f"{type}:{name}"
1284
- for type, names in DEFAULT_MODELS.items()
1285
  for name in names],
1286
  label="Default Models",
1287
  value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
1288
  )
1289
-
1290
  with gr.Column():
1291
  # Custom model input
1292
  custom_models_input = gr.TextArea(
@@ -1294,7 +1317,7 @@ def launch_interface(debug=True):
1294
  placeholder="Enter one model per line in format: type:name",
1295
  lines=3
1296
  )
1297
-
1298
  auto_split_strategies = gr.CheckboxGroup(
1299
  choices=["token", "recursive"],
1300
  label="Split Strategies to Test"
@@ -1313,21 +1336,21 @@ def launch_interface(debug=True):
1313
  auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
1314
  auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
1315
  auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
1316
-
1317
  auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
1318
  auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
1319
  recommendations_output = gr.JSON(label="Recommendations")
1320
-
1321
  def run_automation(file_input, query_input, expected_result, default_models, custom_models,
1322
  split_strategies, chunk_sizes, overlap_sizes,
1323
  vector_store_types, search_types, top_k_values,
1324
  optimize_vocab, use_query_optimization, use_reranking,
1325
  model_feedback):
1326
  """Wrapper function to handle Gradio inputs and run automated tests"""
1327
-
1328
  # Parse model configurations
1329
  model_configs = parse_model_selections(default_models, custom_models)
1330
-
1331
  # Parse test parameters
1332
  test_params = {
1333
  'split_strategy': split_strategies,
@@ -1346,7 +1369,7 @@ def launch_interface(debug=True):
1346
  'custom_separators': [None],
1347
  'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
1348
  }
1349
-
1350
  # Run automated tests
1351
  results_df, stats_df = run_automated_tests(
1352
  file_input.name if file_input else None,
@@ -1356,12 +1379,12 @@ def launch_interface(debug=True):
1356
  expected_result if expected_result else None,
1357
  model_feedback if model_feedback else None
1358
  )
1359
-
1360
  # Generate recommendations based on results
1361
  recommendations = analyze_results(stats_df)
1362
-
1363
  return results_df, stats_df, recommendations
1364
-
1365
  auto_submit_button = gr.Button("Run Automated Tests")
1366
  auto_submit_button.click(
1367
  fn=run_automation,
@@ -1376,25 +1399,25 @@ def launch_interface(debug=True):
1376
  outputs=[auto_results_output, auto_stats_output, recommendations_output]
1377
  )
1378
  ###
1379
-
1380
  with gr.Tab("Results"):
1381
  with gr.Row():
1382
  results_output = gr.DataFrame(label="Results")
1383
  stats_output = gr.DataFrame(label="Statistics")
1384
-
1385
  with gr.Row():
1386
  plot_output = gr.Plot(label="Visualizations")
1387
  model_rankings_output = gr.JSON(label="Model Rankings")
1388
-
1389
  with gr.Row():
1390
  recommendations_output = gr.JSON(label="Recommendations")
1391
-
1392
  with gr.Tab("LLM Suggestions"):
1393
  llm_file_input = gr.File(label="Upload File for LLM Suggestions")
1394
  llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
1395
  llm_suggest_button = gr.Button("Get LLM Suggestions")
1396
  llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
1397
-
1398
  llm_suggest_button.click(
1399
  fn=get_llm_suggested_settings,
1400
  inputs=[llm_file_input, llm_num_chunks],
@@ -1403,9 +1426,9 @@ def launch_interface(debug=True):
1403
  fn=update_inputs_with_llm_suggestions,
1404
  inputs=[llm_suggestions_output],
1405
  outputs=[
1406
- embedding_models_input, split_strategy_input, chunk_size_input,
1407
- overlap_size_input, vector_store_type_input, search_type_input,
1408
- top_k_input, apply_preprocessing_input, optimize_vocab_input,
1409
  apply_phonetic_input, phonetic_weight_input
1410
  ]
1411
  )
@@ -1526,7 +1549,7 @@ Create a simple chat interface and test with various queries about the AI Act. F
1526
  User: "Was sind die Hauptziele des KI-Gesetzes?"
1527
  """
1528
 
1529
-
1530
  tutorial_md = """
1531
  # Advanced Embedding Comparison Tool Tutorial
1532
 
@@ -1675,13 +1698,13 @@ Measures how well an object fits within its own cluster compared to others. Scor
1675
  def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
1676
  with open(file_path, 'r', encoding='utf-8') as f:
1677
  text = f.read()
1678
-
1679
  tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
1680
  tokenizer.pre_tokenizer = Whitespace()
1681
-
1682
  trainer = trainers.WordLevelTrainer(special_tokens=special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
1683
  tokenizer.train_from_iterator([text], trainer)
1684
-
1685
  return tokenizer
1686
  ````
1687
 
@@ -1713,39 +1736,39 @@ def rerank_results(results, query, reranker):
1713
 
1714
 
1715
  ## Useful Resources and Links
1716
-
1717
  Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
1718
-
1719
  ### Embeddings and Vector Databases
1720
  - [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
1721
  - [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
1722
  - [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
1723
-
1724
  ### Natural Language Processing
1725
  - [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
1726
  - [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
1727
  - [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
1728
-
1729
  ### Retrieval-Augmented Generation (RAG)
1730
  - [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
1731
  - [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
1732
-
1733
  ### German Language Processing
1734
  - [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
1735
  - [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
1736
-
1737
  ### Benchmarks and Evaluation
1738
  - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
1739
  - [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
1740
-
1741
  ### Tools and Libraries
1742
  - [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
1743
  - [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
1744
-
1745
  ### Support me
1746
  - [Visual Crew Builder](https://visual-crew.builder.ai/): Tool for create AI systems, workflows and api. Or just a notebook.
1747
-
1748
-
1749
 
1750
  This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
1751
 
@@ -1768,7 +1791,7 @@ def create_chat_app(settings):
1768
  settings['lang'],
1769
  settings['apply_preprocessing']
1770
  )
1771
-
1772
  results, _, _, _ = search_embeddings(
1773
  chunks,
1774
  embedding_model,
@@ -1780,12 +1803,12 @@ def create_chat_app(settings):
1780
  apply_phonetic=settings['apply_phonetic'],
1781
  phonetic_weight=settings['phonetic_weight']
1782
  )
1783
-
1784
  # Generate a response based on the retrieved results
1785
  response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
1786
  for i, result in enumerate(results[:settings['top_k']]):
1787
  response += f"{i+1}. {result['content'][:100]}...\n\n"
1788
-
1789
  return response
1790
 
1791
  with gr.Blocks() as chat_interface:
@@ -1823,7 +1846,7 @@ if __name__ == "__main__":
1823
  launch_interface()
1824
  # Uncomment the following line to launch the sample chat app
1825
  ´´´
1826
-
1827
  """
1828
 
1829
 
@@ -1832,10 +1855,10 @@ if __name__ == "__main__":
1832
  ["Embedding Comparison", "Tutorial", "Use Case"]
1833
  )
1834
 
1835
- iface.launch(debug=dubug)
1836
 
1837
  # Enhanced Automated Testing
1838
- def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
1839
  test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
1840
  model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
1841
  """
@@ -1844,16 +1867,16 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1844
  all_results = []
1845
  all_stats = []
1846
  model_manager = ModelManager()
1847
-
1848
  # Create parameter grid excluding model configurations
1849
  base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
1850
  param_grid = ParameterGrid(base_params)
1851
-
1852
  # Test each model configuration with all parameter combinations
1853
  for model_config in tqdm(model_configs, desc="Testing models"):
1854
  model_type = model_config['type']
1855
  model_name = model_config['name']
1856
-
1857
  for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
1858
  try:
1859
  # Process files and get chunks
@@ -1868,11 +1891,11 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1868
  params['lang'],
1869
  params['apply_preprocessing']
1870
  )
1871
-
1872
  # Apply vocabulary optimization if specified
1873
  if params['optimize_vocab']:
1874
  tokenizer, chunks = optimize_vocabulary(chunks)
1875
-
1876
  # Apply query optimization if specified
1877
  current_query = query
1878
  if params['use_query_optimization']:
@@ -1886,7 +1909,7 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1886
  params['top_k']
1887
  )
1888
  current_query = " ".join(optimized_queries)
1889
-
1890
  # Perform search
1891
  results, search_time, vector_store, raw_results = search_embeddings(
1892
  chunks,
@@ -1900,25 +1923,25 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1900
  params['apply_phonetic'],
1901
  params['phonetic_weight']
1902
  )
1903
-
1904
  # Apply reranking if specified
1905
  if params['use_reranking']:
1906
- reranker = pipeline("text-classification",
1907
  model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1908
  raw_results = rerank_results(raw_results, current_query, reranker)
1909
-
1910
  # Calculate statistics
1911
  stats = ResultAnalyzer.calculate_statistics(
1912
  raw_results, search_time, vector_store, num_tokens,
1913
  embedding_model, current_query, params['top_k'],
1914
  expected_result, model_feedback
1915
  )
1916
-
1917
  # Update model rankings
1918
  model_id = f"{model_type}:{model_name}"
1919
  ranking_score = calculate_model_ranking_score(stats)
1920
  model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
1921
-
1922
  # Add model information to stats
1923
  stats.update({
1924
  "model_type": model_type,
@@ -1926,15 +1949,15 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
1926
  "model": f"{model_type} - {model_name}",
1927
  **params
1928
  })
1929
-
1930
  # Format and store results
1931
  all_results.extend(format_results(raw_results, stats))
1932
  all_stats.append(stats)
1933
-
1934
  except Exception as e:
1935
  print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
1936
  continue
1937
-
1938
  return pd.DataFrame(all_results), pd.DataFrame(all_stats)
1939
 
1940
  # Helper function to calculate model ranking score
@@ -1947,7 +1970,7 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
1947
  'contains_expected': 0.3,
1948
  'expected_result_rank': -0.2 # Negative weight because lower rank is better
1949
  }
1950
-
1951
  score = 0.0
1952
  for metric, weight in weights.items():
1953
  if metric in stats and not isinstance(stats[metric], str):
@@ -1958,9 +1981,8 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
1958
  else:
1959
  value = float(stats[metric])
1960
  score += weight * value
1961
-
1962
  return score
1963
 
1964
  if __name__ == "__main__":
1965
  launch_interface()
1966
-
 
41
  from typing import List, Tuple, Optional
42
 
43
 
44
+ #hf_token = os.getenv("hf_token")
45
+ #login(token=hf_token)
46
 
47
  # Define the model pipeline with additional generation parameters
48
  #model_pipeline = pipeline(
 
154
  }
155
  }
156
 
157
+
158
  def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
159
  """Update model ranking based on performance and optional feedback"""
160
  current_score = self.rankings.get(model_id, 0.0)
161
  # Weighted average of current score and new score
162
  self.rankings[model_id] = 0.7 * current_score + 0.3 * score
163
+
164
  if feedback:
165
  if model_id not in self.model_stats:
166
  self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
167
  self.model_stats[model_id]["feedback_count"] += 1
168
  self.model_stats[model_id]["feedback"].append(feedback)
169
+
170
  def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
171
  """Get top n ranked models"""
172
  return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
173
+
174
  def get_model_stats(self, model_id: str) -> Dict[str, Any]:
175
  """Get statistics for a specific model"""
176
  return self.model_stats.get(model_id, {})
177
 
178
+
179
  def add_model(self, provider, name, model_path):
180
  if provider not in self.models:
181
  self.models[provider] = {}
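
For context on the ranking logic in `update_model_ranking` above: the 0.7/0.3 blend is an exponential moving average, so recent test runs gradually dominate the stored ranking. A minimal standalone sketch (hypothetical scores, not part of app.py):

```python
# Hypothetical illustration of the 0.7 * current + 0.3 * new update rule
def ema_update(current: float, new_score: float, alpha: float = 0.3) -> float:
    return (1 - alpha) * current + alpha * new_score

score = 0.0
for observed in (0.8, 0.9, 0.85):   # scores from three hypothetical test runs
    score = ema_update(score, observed)
    print(round(score, 3))           # 0.24, 0.438, 0.562 -> drifts toward recent scores
```
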
 
286
  def preprocess_text(text, lang='german', apply_preprocessing=False):
287
  if not apply_preprocessing:
288
  return text
289
+
290
  text = text.lower()
291
  text = re.sub(r'[^a-zA-Z\s]', '', text)
292
+
293
  try:
294
  tokens = word_tokenize(text, language=lang)
295
  except LookupError:
296
  print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
297
  tokens = simple_tokenize(text)
298
+
299
  try:
300
  stop_words = set(stopwords.words(lang))
301
  except LookupError:
302
  print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
303
  stop_words = set()
304
  tokens = [token for token in tokens if token not in stop_words]
305
+
306
  try:
307
  stemmer = SnowballStemmer(lang)
308
  tokens = [stemmer.stem(token) for token in tokens]
309
  except ValueError:
310
  print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
311
+
312
  return ' '.join(tokens)
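
A quick way to exercise the preprocessing path above (a sketch, assuming the NLTK punkt and stopwords data are available for the chosen language; note that the ASCII-only regex also strips German umlauts before tokenization):

```python
import nltk

nltk.download("punkt", quiet=True)       # tokenizer data used by word_tokenize
nltk.download("stopwords", quiet=True)   # stopword lists used above

sample = "Die Verordnung regelt die Anforderungen an KI-Systeme."
print(preprocess_text(sample, lang="german", apply_preprocessing=True))
# -> lowercased, punctuation/umlauts stripped, stopwords removed, stems joined by spaces
```
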
313
 
314
  def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
 
341
  ) -> str:
342
  """
343
  CPU-optimized version of query expansion using a small language model.
344
+
345
  Args:
346
  query: Original search query
347
  query_optimization_model: Name or path of the model to use for optimization
 
351
  search_type: Type of search being performed
352
  top_k: Number of expansion terms to add
353
  use_gpu: Whether to use GPU if available (defaults to False for CPU)
354
+
355
  Returns:
356
  Expanded query string
357
  """
358
  try:
359
  # Set device
360
  device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
361
+
362
  # 1. Basic text preprocessing (CPU-based)
363
  tokens = word_tokenize(query.lower())
364
+
365
  # 2. WordNet synonyms expansion (CPU-based)
366
  expanded_terms = set()
367
  for token in tokens:
 
370
  for syn in synsets:
371
  # Limit number of lemmas
372
  expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
373
+
374
  # 3. Use provided model with reduced complexity
375
  try:
376
  # Load model with reduced memory footprint
 
384
  low_cpu_mem_usage=True,
385
  device_map="cpu"
386
  )
387
+
388
  # Move model to CPU and eval mode
389
  model = model.to(device)
390
  model.eval()
391
+
392
  # Prepare input with reduced length
393
  prompt = f"Enhance this search query with relevant terms: {query}"
394
  inputs = tokenizer(
 
398
  truncation=True,
399
  padding=True
400
  )
401
+
402
  # Generate with minimal parameters
403
  with torch.no_grad():
404
  outputs = model.generate(
 
409
  do_sample=False,
410
  early_stopping=True
411
  )
412
+
413
  enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
414
+
415
  # Clear CUDA cache if GPU was used
416
  if device == "cuda":
417
  torch.cuda.empty_cache()
418
+
419
  except Exception as model_error:
420
  print(f"Model-based expansion failed: {str(model_error)}")
421
  enhanced_query = query
422
+
423
  # 4. Combine original and expanded terms
424
  final_terms = set(tokens)
425
  final_terms.update(expanded_terms)
426
  if enhanced_query != query:
427
  final_terms.update(word_tokenize(enhanced_query.lower()))
428
+
429
  # 5. Remove stopwords and select top_k most relevant terms
430
  stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
431
  final_terms = [term for term in final_terms if term not in stopwords]
432
+
433
  # Combine with original query
434
  expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
435
+
436
  # Clean up
437
  del model
438
  del tokenizer
439
  if device == "cuda":
440
  torch.cuda.empty_cache()
441
+
442
+ return expanded_query.strip() #[Document(page_content=expanded_query.strip())]
443
+
444
  except Exception as e:
445
  print(f"Query optimization failed: {str(e)}")
446
+ return query #[Document(page_content=query)] # Return original query if optimization fails
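
With this change `optimize_query` returns a plain string instead of a one-element `Document` list, so callers can use the value directly. A hedged usage sketch (argument values are illustrative; the positional order follows the call made later in `compare_embeddings`):

```python
# Illustrative call; 'google/flan-t5-base' is the default from the test parameter grid
expanded = optimize_query(
    "Hauptziele des KI-Gesetzes",   # query
    "google/flan-t5-base",          # query_optimization_model
    chunks, embedding_model,         # passed through by the caller
    "FAISS", "similarity", 3,        # vector_store_type, search_type, top_k
)
search_query = expanded              # plain string; no Document unwrapping needed
```
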
447
 
448
 
449
 
 
458
  use_gpu=False # Explicitly use CPU
459
  )
460
  """
461
+
462
 
463
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
464
  tokenized_texts = [text.split() for text in texts]
465
+
466
  if model_type == 'word2vec':
467
  model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
468
  elif model_type == 'fasttext':
469
  model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
470
  else:
471
  raise ValueError("Unsupported model type")
472
+
473
  return model
474
 
475
  class CustomEmbeddings(HuggingFaceEmbeddings):
476
  def __init__(self, model_path):
477
  self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
478
+
479
  def embed_documents(self, texts):
480
  return [self.model.wv[text.split()] for text in texts]
481
+
482
  def embed_query(self, text):
483
  return self.model.wv[text.split()]
484
 
 
520
  chunk_size=chunk_size,
521
  chunk_overlap=overlap_size,
522
  add_start_index=True, # If `True`, includes chunk's start index in metadata
523
+ strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
524
  separators=custom_separators or ["\n\n", "\n", " ", ""]
525
  )
526
  else:
 
534
  multi_process=True,
535
  # model_kwargs={"device": "cpu"},
536
  #encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
537
+ )
538
  elif model_type == 'OpenAI':
539
  return OpenAIEmbeddings(model=model_path)
540
  elif model_type == 'Cohere':
 
566
  phonetic_sim = phonetic_match(doc_text, query)
567
  combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
568
  return combined_sim
569
+
570
  def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
571
  chunks = list(chunks_tuple)
572
+
573
  if vector_store_type == 'FAISS':
574
  return FAISS.from_texts(chunks, embedding_model)
575
  elif vector_store_type == 'Chroma':
 
587
  for file in os.listdir(FILES_DIR):
588
  file_path = os.path.join(FILES_DIR, file)
589
  text += FileHandler.extract_text(file_path)
590
+
591
  if custom_tokenizer_file:
592
  tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
593
  text = ' '.join(custom_tokenize(text, tokenizer))
 
603
 
604
  def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
605
  preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
606
+
607
  vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
608
  retriever = get_retriever(vector_store, search_type, {"k": top_k})
609
 
 
613
  #this should be optional
614
  def score_result(doc):
615
  base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
616
+
617
  # Add bonus for containing expected result
618
  expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
619
+
620
  if apply_phonetic:
621
  phonetic_score = phonetic_match(doc.page_content, query)
622
  return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
 
645
  # Enhanced Result Analysis
646
  class ResultAnalyzer:
647
  @staticmethod
648
+ def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
649
  top_k, expected_result=None, model_feedback=None):
650
  stats = {
651
  "num_results": len(results),
 
657
  "embedding_dimension": len(embedding_model.embed_query(query)),
658
  "top_k": top_k,
659
  }
660
+
661
  # Add vector store statistics
662
  try:
663
  if hasattr(vector_store, '_index'):
 
666
  stats["vector_store_size"] = len(vector_store._collection.get())
667
  except:
668
  stats["vector_store_size"] = "N/A"
669
+
670
  # Add expected result statistics if provided
671
  if expected_result:
672
  stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
673
+ stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
674
  if expected_result in doc.page_content), -1) + 1
675
+
676
  # Calculate diversity metrics for larger result sets
677
  if len(results) > 3: # Changed from 1000 to make it more practical
678
  embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
 
681
  else:
682
  stats["result_diversity"] = "N/A"
683
  stats["silhouette_score"] = "N/A"
684
+
685
  # Add ranking correlation
686
  query_embedding = embedding_model.embed_query(query)
687
  result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
 
691
  stats["rank_correlation"] = rank_correlation
692
  else:
693
  stats["rank_correlation"] = "N/A"
694
+
695
  # Add model feedback if provided
696
  if model_feedback:
697
  stats["model_feedback"] = model_feedback
698
+
699
  return stats
700
+
701
  @staticmethod
702
  def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
703
  """Calculate diversity score for embeddings"""
704
  embeddings_array = np.array(embeddings)
705
  pairwise_similarities = np.inner(embeddings_array, embeddings_array)
706
  return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
707
+
708
  @staticmethod
709
  def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
710
  """Calculate silhouette score for embeddings"""
 
724
  # Add model column if not present
725
  if 'model' not in stats_df.columns:
726
  stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
727
+
728
  fig, axs = plt.subplots(2, 2, figsize=(20, 20))
729
+
730
  # Handle empty dataframe case
731
  if len(stats_df) == 0:
732
  return fig
733
+
734
  # Create plots with error handling
735
  try:
736
  sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
 
738
  axs[0, 0].tick_params(axis='x', rotation=45)
739
  except Exception as e:
740
  print(f"Error in search time plot: {e}")
741
+
742
  try:
743
+ sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
744
  hue='model', ax=axs[0, 1])
745
  axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
746
  except Exception as e:
747
  print(f"Error in diversity plot: {e}")
748
+
749
  try:
750
  sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
751
  axs[1, 0].set_title('Distribution of Result Content Lengths')
752
  axs[1, 0].tick_params(axis='x', rotation=45)
753
  except Exception as e:
754
  print(f"Error in content length plot: {e}")
755
+
756
  try:
757
  valid_embeddings = results_df['embedding'].dropna().values
758
  if len(valid_embeddings) > 1:
759
  tsne = TSNE(n_components=2, random_state=42)
760
  embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
761
+ sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
762
+ hue=results_df['Model'][:len(valid_embeddings)],
763
  ax=axs[1, 1])
764
  axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
765
  else:
766
+ axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
767
  ha='center', va='center')
768
  except Exception as e:
769
  print(f"Error in embedding visualization: {e}")
770
+
771
  plt.tight_layout()
772
  return fig
773
 
 
778
  #plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
779
  #plt.show()
780
 
781
+
782
  def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
783
  tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
784
 
785
  word_freq = Counter(word for text in texts for word in text.split())
786
+
787
  optimized_texts = [
788
  ' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
789
  for text in texts
790
  ]
791
+
792
  trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
793
  tokenizer.train_from_iterator(optimized_texts, trainer)
794
+
795
  return tokenizer, optimized_texts
796
+
797
  import numpy as np
798
  from transformers import TextClassificationPipeline
799
  from typing import List, Union, Any
800
 
801
+
802
 
803
  model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
804
 
805
 
806
  def rerank_results(
807
+ results: List[Any],
808
+ query: str,
809
  reranker: Union[TextClassificationPipeline, Any]
810
  ) -> List[Any]:
811
  """
812
+
813
  """
814
  if not results:
815
  return results
816
+
817
  # Step 1: Encode the query and documents using SentenceTransformer
818
  query_embedding = model.encode(query, convert_to_tensor=True)
819
  doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
820
  doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
821
+
822
  # Step 2: Compute cosine similarities between query and document embeddings
823
  cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
824
+
825
  # Step 3: Sort documents by similarity score in descending order
826
+ reranked_idx = np.argsort(cosine_scores.cpu().numpy())[::-1]
827
+
828
  # Step 4: Return the reranked documents
829
  reranked_results = [results[i] for i in reranked_idx]
830
+
831
  return reranked_results
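
The added `.cpu()` call matters when the SentenceTransformer runs on a GPU, since NumPy cannot read CUDA tensors directly. A self-contained sketch of the same cosine-similarity reranking idea (example documents are hypothetical):

```python
from sentence_transformers import SentenceTransformer, util
import numpy as np

st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
docs = ["Das KI-Gesetz regelt Hochrisiko-Systeme.", "Unrelated text about cooking."]

q_emb = st.encode("Ziele des KI-Gesetzes", convert_to_tensor=True)
d_emb = st.encode(docs, convert_to_tensor=True)

scores = util.cos_sim(q_emb, d_emb)[0]               # one similarity per document
order = np.argsort(scores.cpu().numpy())[::-1]        # .cpu() keeps this safe on GPU
print([docs[i] for i in order])                       # most relevant document first
```
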
832
 
833
 
 
878
  if optimize_vocab:
879
  tokenizer, optimized_chunks = optimize_vocabulary(chunks)
880
  chunks = optimized_chunks
881
+
882
  search_query = query
883
+
884
  if use_query_optimization:
885
  optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
886
  #query = " ".join(optimized_queries)
887
+ search_query = optimized_queries # " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
888
 
889
  results, search_time, vector_store, results_raw = search_embeddings(
890
  chunks,
 
897
  lang,
898
  apply_phonetic,
899
  phonetic_weight
900
+ )
901
+
902
  if use_reranking:
903
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
904
  results_raw = rerank_results(results_raw, query, reranker)
 
953
  def automated_testing(file, query, test_params, expected_result=None):
954
  all_results = []
955
  all_stats = []
956
+
957
  param_grid = ParameterGrid(test_params)
958
  print(param_grid)
959
  for params in tqdm(param_grid, desc="Running tests"):
 
995
  params['apply_phonetic'],
996
  params['phonetic_weight']
997
  )
998
+
999
  if params['use_reranking']:
1000
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1001
  results_raw = rerank_results(results_raw, query, reranker)
 
1022
  'contains_expected': 0.5, # High weight for containing the expected result
1023
  'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
1024
  }
1025
+ if stats_df.empty:
1026
+ print("stats_df is empty. Cannot compute best configuration.")
1027
+ return None
1028
+
1029
  for metric in metric_weights.keys():
1030
+
1031
+ if metric in stats_df.columns:
1032
+ stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
1033
+ else:
1034
+ stats_df[metric] = 0
1035
+ print("Column 'search_time' is missing in stats_df.")
1036
+
1037
+
1038
+
1039
  stats_df['weighted_score'] = sum(
1040
+ stats_df[metric].fillna(0) * weight
1041
  for metric, weight in metric_weights.items()
1042
  )
1043
+
1044
  best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
1045
+
1046
  recommendations = {
1047
  'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
1048
  'best_settings': {
 
1069
  'expected_result_rank': int(best_config['expected_result_rank'])
1070
  }
1071
  }
1072
+
1073
  return recommendations
1074
 
1075
  ####
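
To make the weighted ranking in `analyze_results` concrete, here is a small hypothetical example. Only the `contains_expected` and `expected_result_rank` weights are visible in this hunk; the remaining weights below are assumed for illustration:

```python
import pandas as pd

# Hypothetical stats for two tested configurations
stats_df = pd.DataFrame({
    "search_time":          [0.20, 0.05],
    "result_diversity":     [0.60, 0.40],
    "rank_correlation":     [0.70, 0.50],
    "contains_expected":    [1, 0],
    "expected_result_rank": [1, 3],
})
metric_weights = {
    "search_time": -0.3, "result_diversity": 0.2, "rank_correlation": 0.3,  # assumed
    "contains_expected": 0.5, "expected_result_rank": -0.4,                 # as in the hunk
}
stats_df["weighted_score"] = sum(stats_df[m].fillna(0) * w for m, w in metric_weights.items())
print(stats_df["weighted_score"])  # the first row wins: it contains the expected result
```
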
 
1079
  return {"error": "No file uploaded"}
1080
 
1081
  chunks, _, _ = process_files(
1082
+ file.name,
1083
+ 'HuggingFace',
1084
+ 'paraphrase-miniLM',
1085
+ 'recursive',
1086
+ 250,
1087
  50,
1088
  custom_separators=None
1089
  )
1090
+
1091
  # Select a few random chunks
1092
  sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
1093
+
1094
+
1095
+ llm_pipeline = pipeline(model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')
1096
+
1097
+
1098
+ prompt=f'''
1099
+ <|start_header_id|>system<|end_header_id|>
1100
+ You are an expert in information retrieval.
1101
+ You know about the strengths and weaknesses of all models.
1102
+
1103
+ Given the following text chunks from a document,
1104
+ suggest optimal settings for an embedding-based search system. The settings should include:
1105
+
1106
+ 1. Embedding model type and name
1107
+ 2. Split strategy (token or recursive)
1108
+ 3. Chunk size
1109
+ 4. Overlap size
1110
+ 5. Vector store type (FAISS or Chroma)
1111
+ 6. Search type (similarity, mmr, or custom)
1112
+ 7. Top K results to retrieve
1113
+ 8. Whether to apply preprocessing
1114
+ 9. Whether to optimize vocabulary
1115
+ 10. Whether to apply phonetic matching
1116
+
1117
+ Expected output format:
1118
+ {{
1119
+ "embedding_models": "embedding_model_type:embedding_model_name",
1120
+ "split_strategy": "token or recursive",
1121
+ "chunk_size": 250,
1122
+ "overlap_size": 50,
1123
+ "vector_store_type": "FAISS or Chroma",
1124
+ "search_type": "similarity, mmr, or custom",
1125
+ "top_k": 5,
1126
+ "apply_preprocessing": True,
1127
+ "optimize_vocab": True,
1128
+ "apply_phonetic": False,
1129
+ "phonetic_weight": 0.3 #
1130
+ }}
1131
+
1132
+ Provide your suggestions in a Python dictionary format.
1133
+
1134
+ Show me the settings. You SHOULD NOT include any other text in the response.
1135
+ Fill out the settings and choose useful values.
1136
+ Respect the user's use case and content snippet. Choose the settings based on the chunks.
1137
+
1138
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
1139
+ User use case:
1140
+ {"small local", "large total context", ...}
1141
+
1142
+ Total content length:
1143
+ {len(' '.join(chunks))}
1144
+
1145
+ Content snippet:
1146
+ {' '.join(sample_chunks)}
1147
+ <|eot_id|><|start_header_id|>assistant<|end_header_id|>
1148
+ '''
1149
+ suggested_settings = llm_pipeline(
1150
+ prompt,
1151
+ do_sample=True,
1152
+ top_k=10,
1153
+ num_return_sequences=1,
1154
+ return_full_text=False,
1155
+ max_new_tokens=1900, # Control the length of the output,
1156
+ truncation=True, # Enable truncation
1157
+ )
1158
+
1159
+
1160
+ #suggested_settings = llm.invoke(prompt)
1161
  print("setting suggested")
1162
  print(suggested_settings)
1163
  # Parse the generated text to extract the dictionary
 
1183
  def update_inputs_with_llm_suggestions(suggestions):
1184
  if suggestions is None or "error" in suggestions:
1185
  return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
1186
+
1187
  return [
1188
  gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
1189
  gr.update(value=suggestions["split_strategy"]), # split_strategy_input
 
1201
  def parse_model_selections(default_models, custom_models):
1202
  """
1203
  Parse selected default models and custom models into model configurations
1204
+
1205
  Args:
1206
  default_models (List[str]): Selected default models in format "type:name"
1207
  custom_models (str): Custom models string with one model per line in format "type:name"
1208
+
1209
  Returns:
1210
  List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
1211
  """
1212
  model_configs = []
1213
+
1214
  # Process default models
1215
  if default_models:
1216
  for model in default_models:
 
1219
  'type': model_type,
1220
  'name': model_name
1221
  })
1222
+
1223
  # Process custom models
1224
  if custom_models:
1225
  custom_model_lines = custom_models.strip().split('\n')
 
1230
  'type': model_type.strip(),
1231
  'name': model_name.strip()
1232
  })
1233
+
1234
  return model_configs
1235
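  # Illustrative usage (the custom model name below is a placeholder, not a default of this tool):
  # parse_model_selections(["HuggingFace:paraphrase-miniLM"],
  #                        "HuggingFace:sentence-transformers/all-MiniLM-L6-v2")
  # -> [{'type': 'HuggingFace', 'name': 'paraphrase-miniLM'},
  #     {'type': 'HuggingFace', 'name': 'sentence-transformers/all-MiniLM-L6-v2'}]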
 
1236
  def parse_comma_separated(text):
 
1240
  return [x.strip() for x in text.split(',') if x.strip()]
1241
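  # Example: parse_comma_separated("100, 250 , 500") -> ["100", "250", "500"]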
 
1242
 
1243
+
1244
  # Gradio Interface
1245
  def launch_interface(debug=True):
1246
  with gr.Blocks() as iface:
1247
  gr.Markdown("# Advanced Embedding Comparison Tool")
1248
+
1249
  with gr.Tab("Simple"):
1250
  file_input = gr.File(label="Upload File (Optional)")
1251
  query_input = gr.Textbox(label="Search Query")
 
1260
  label="Embedding Models"
1261
  )
1262
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
1263
+
1264
  with gr.Tab("Advanced"):
1265
  custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
1266
  split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
 
1270
  vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
1271
  search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
1272
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
1273
+
1274
  with gr.Tab("Expert"):
1275
  apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
1276
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
 
1288
  with gr.Row():
1289
  auto_file_input = gr.File(label="Upload File (Optional)")
1290
  auto_query_input = gr.Textbox(label="Search Query")
1291
+
1292
  with gr.Row():
1293
  auto_expected_result_input = gr.Textbox(
1294
  label="Expected Result (Optional)",
 
1298
  label="Model Feedback (Optional)",
1299
  placeholder="Enter any feedback about model performance"
1300
  )
1301
+
1302
  with gr.Row():
1303
  with gr.Column():
1304
  # Default model selection
1305
  default_models_input = gr.CheckboxGroup(
1306
+ choices=[f"{type}:{name}"
1307
+ for type, names in DEFAULT_MODELS.items()
1308
  for name in names],
1309
  label="Default Models",
1310
  value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
1311
  )
1312
+
1313
  with gr.Column():
1314
  # Custom model input
1315
  custom_models_input = gr.TextArea(
 
1317
  placeholder="Enter one model per line in format: type:name",
1318
  lines=3
1319
  )
1320
+
1321
  auto_split_strategies = gr.CheckboxGroup(
1322
  choices=["token", "recursive"],
1323
  label="Split Strategies to Test"
 
1336
  auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
1337
  auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
1338
  auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
1339
+
1340
  auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
1341
  auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
1342
  recommendations_output = gr.JSON(label="Recommendations")
1343
+
1344
  def run_automation(file_input, query_input, expected_result, default_models, custom_models,
1345
  split_strategies, chunk_sizes, overlap_sizes,
1346
  vector_store_types, search_types, top_k_values,
1347
  optimize_vocab, use_query_optimization, use_reranking,
1348
  model_feedback):
1349
  """Wrapper function to handle Gradio inputs and run automated tests"""
1350
+
1351
  # Parse model configurations
1352
  model_configs = parse_model_selections(default_models, custom_models)
1353
+
1354
  # Parse test parameters
1355
  test_params = {
1356
  'split_strategy': split_strategies,
 
1369
  'custom_separators': [None],
1370
  'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
1371
  }
1372
+
1373
  # Run automated tests
1374
  results_df, stats_df = run_automated_tests(
1375
  file_input.name if file_input else None,
 
1379
  expected_result if expected_result else None,
1380
  model_feedback if model_feedback else None
1381
  )
1382
+
1383
  # Generate recommendations based on results
1384
  recommendations = analyze_results(stats_df)
1385
+
1386
  return results_df, stats_df, recommendations
1387
+
1388
  auto_submit_button = gr.Button("Run Automated Tests")
1389
  auto_submit_button.click(
1390
  fn=run_automation,
 
1399
  outputs=[auto_results_output, auto_stats_output, recommendations_output]
1400
  )
1401
  ###
1402
+
1403
  with gr.Tab("Results"):
1404
  with gr.Row():
1405
  results_output = gr.DataFrame(label="Results")
1406
  stats_output = gr.DataFrame(label="Statistics")
1407
+
1408
  with gr.Row():
1409
  plot_output = gr.Plot(label="Visualizations")
1410
  model_rankings_output = gr.JSON(label="Model Rankings")
1411
+
1412
  with gr.Row():
1413
  recommendations_output = gr.JSON(label="Recommendations")
1414
+
1415
  with gr.Tab("LLM Suggestions"):
1416
  llm_file_input = gr.File(label="Upload File for LLM Suggestions")
1417
  llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
1418
  llm_suggest_button = gr.Button("Get LLM Suggestions")
1419
  llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
1420
+
1421
  llm_suggest_button.click(
1422
  fn=get_llm_suggested_settings,
1423
  inputs=[llm_file_input, llm_num_chunks],
 
1426
  fn=update_inputs_with_llm_suggestions,
1427
  inputs=[llm_suggestions_output],
1428
  outputs=[
1429
+ embedding_models_input, split_strategy_input, chunk_size_input,
1430
+ overlap_size_input, vector_store_type_input, search_type_input,
1431
+ top_k_input, apply_preprocessing_input, optimize_vocab_input,
1432
  apply_phonetic_input, phonetic_weight_input
1433
  ]
1434
  )
 
1549
  User: "Was sind die Hauptziele des KI-Gesetzes?"
1550
  """
1551
 
1552
+
1553
  tutorial_md = """
1554
  # Advanced Embedding Comparison Tool Tutorial
1555
 
 
1698
  def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
1699
  with open(file_path, 'r', encoding='utf-8') as f:
1700
  text = f.read()
1701
+
1702
  tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
1703
  tokenizer.pre_tokenizer = Whitespace()
1704
+
1705
  special = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
  trainer = trainers.WordLevelTrainer(special_tokens=special, vocab_size=vocab_size) if model_type == 'WordLevel' else trainers.BpeTrainer(special_tokens=special, vocab_size=vocab_size)
1706
  tokenizer.train_from_iterator([text], trainer)
1707
+
1708
  return tokenizer
1709
  ```
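  A minimal usage sketch (the file path and vocabulary size are illustrative, not defaults of this tool):

  ```python
  # Train a word-level tokenizer on a local corpus, then tokenize a sample sentence.
  tokenizer = create_custom_tokenizer("my_corpus.txt", model_type="WordLevel", vocab_size=5000)
  print(tokenizer.get_vocab_size())
  print(tokenizer.encode("Embedding comparison made easy").tokens)
  ```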
1710
 
 
1736
 
1737
 
1738
  ## Useful Resources and Links
1739
+
1740
  Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
1741
+
1742
  ### Embeddings and Vector Databases
1743
  - [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
1744
  - [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
1745
  - [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
1746
+
1747
  ### Natural Language Processing
1748
  - [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
1749
  - [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
1750
  - [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
1751
+
1752
  ### Retrieval-Augmented Generation (RAG)
1753
  - [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
1754
  - [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
1755
+
1756
  ### German Language Processing
1757
  - [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
1758
  - [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
1759
+
1760
  ### Benchmarks and Evaluation
1761
  - [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
1762
  - [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
1763
+
1764
  ### Tools and Libraries
1765
  - [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
1766
  - [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
1767
+
1768
  ### Support me
1769
  - [Visual Crew Builder](https://visual-crew.builder.ai/): A tool for creating AI systems, workflows, and APIs. Or just a notebook.
1770
+
1771
+
1772
 
1773
  This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
1774
 
 
1791
  settings['lang'],
1792
  settings['apply_preprocessing']
1793
  )
1794
+
1795
  results, _, _, _ = search_embeddings(
1796
  chunks,
1797
  embedding_model,
 
1803
  apply_phonetic=settings['apply_phonetic'],
1804
  phonetic_weight=settings['phonetic_weight']
1805
  )
1806
+
1807
  # Generate a response based on the retrieved results
1808
  response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
1809
  for i, result in enumerate(results[:settings['top_k']]):
1810
  response += f"{i+1}. {result['content'][:100]}...\n\n"
1811
+
1812
  return response
1813
 
1814
  with gr.Blocks() as chat_interface:
 
1846
  launch_interface()
1847
  # Uncomment the following line to launch the sample chat app
1848
  ```
1849
+
1850
  """
1851
 
1852
 
 
1855
  ["Embedding Comparison", "Tutorial", "Use Case"]
1856
  )
1857
 
1858
+ iface.launch(debug=True, share=True)
1859
 
1860
  # Enhanced Automated Testing
1861
+ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
1862
  test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
1863
  model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
1864
  """
 
1867
  all_results = []
1868
  all_stats = []
1869
  model_manager = ModelManager()
1870
+
1871
  # Create parameter grid excluding model configurations
1872
  base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
1873
  param_grid = ParameterGrid(base_params)
1874
+
1875
  # Test each model configuration with all parameter combinations
1876
  for model_config in tqdm(model_configs, desc="Testing models"):
1877
  model_type = model_config['type']
1878
  model_name = model_config['name']
1879
+
1880
  for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
1881
  try:
1882
  # Process files and get chunks
 
1891
  params['lang'],
1892
  params['apply_preprocessing']
1893
  )
1894
+
1895
  # Apply vocabulary optimization if specified
1896
  if params['optimize_vocab']:
1897
  tokenizer, chunks = optimize_vocabulary(chunks)
1898
+
1899
  # Apply query optimization if specified
1900
  current_query = query
1901
  if params['use_query_optimization']:
 
1909
  params['top_k']
1910
  )
1911
  current_query = " ".join(optimized_queries)
1912
+
1913
  # Perform search
1914
  results, search_time, vector_store, raw_results = search_embeddings(
1915
  chunks,
 
1923
  params['apply_phonetic'],
1924
  params['phonetic_weight']
1925
  )
1926
+
1927
  # Apply reranking if specified
1928
  if params['use_reranking']:
1929
+ reranker = pipeline("text-classification",
1930
  model="cross-encoder/ms-marco-MiniLM-L-12-v2")
1931
  raw_results = rerank_results(raw_results, current_query, reranker)
1932
+
1933
  # Calculate statistics
1934
  stats = ResultAnalyzer.calculate_statistics(
1935
  raw_results, search_time, vector_store, num_tokens,
1936
  embedding_model, current_query, params['top_k'],
1937
  expected_result, model_feedback
1938
  )
1939
+
1940
  # Update model rankings
1941
  model_id = f"{model_type}:{model_name}"
1942
  ranking_score = calculate_model_ranking_score(stats)
1943
  model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
1944
+
1945
  # Add model information to stats
1946
  stats.update({
1947
  "model_type": model_type,
 
1949
  "model": f"{model_type} - {model_name}",
1950
  **params
1951
  })
1952
+
1953
  # Format and store results
1954
  all_results.extend(format_results(raw_results, stats))
1955
  all_stats.append(stats)
1956
+
1957
  except Exception as e:
1958
  print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
1959
  continue
1960
+
1961
  return pd.DataFrame(all_results), pd.DataFrame(all_stats)
1962
 
1963
  # Helper function to calculate model ranking score
 
1970
  'contains_expected': 0.3,
1971
  'expected_result_rank': -0.2 # Negative weight because lower rank is better
1972
  }
1973
+
1974
  score = 0.0
1975
  for metric, weight in weights.items():
1976
  if metric in stats and not isinstance(stats[metric], str):
 
1981
  else:
1982
  value = float(stats[metric])
1983
  score += weight * value
1984
+
1985
  return score
1986
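  # Worked example (hypothetical values): contains_expected = 1 at
  # expected_result_rank = 2 adds 0.3 * 1 + (-0.2) * 2 = -0.1 to the score;
  # the remaining metrics contribute their weighted values on top.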
 
1987
  if __name__ == "__main__":
1988
  launch_interface()