Chris4K committed on
Commit 975a7fc
1 Parent(s): 4b5f1bf

Update app.py

Files changed (1)
  1. app.py +22 -50
app.py CHANGED
@@ -45,7 +45,7 @@ login(token=hf_token)
 # Define the model pipeline with additional generation parameters
 model_pipeline = pipeline(
     # model="meta-llama/Llama-3.2-1B",
-    model="sentence-transformers/all-MiniLM-L6-v2",
+    model="meta-llama/Llama-3.2-1B",
     #use_auth_token=hf_token,
     max_length=1000,  # You can increase this if needed
     max_new_tokens=500  # Limit how many tokens are generated
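
A note on this hunk: `sentence-transformers/all-MiniLM-L6-v2` is an embedding model, so reverting to `meta-llama/Llama-3.2-1B` presumably restores a model that `pipeline()` can infer a text-generation task for. A minimal sketch of the resulting call follows; the explicit task string is an assumption on my part, since the committed code relies on task inference from the model config:

from transformers import pipeline

# Sketch only, not the committed code: naming the task explicitly
# avoids relying on inference from the model config.
model_pipeline = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B",
    max_new_tokens=500,  # limit how many tokens are generated
)

When both `max_length` and `max_new_tokens` are set, as in the committed call, recent transformers versions let `max_new_tokens` take precedence for generation.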
@@ -510,9 +510,10 @@ import numpy as np
 from transformers import TextClassificationPipeline
 from typing import List, Union, Any
 
-import numpy as np
-from transformers import pipeline, TextClassificationPipeline
-from typing import List, Any, Union
+
+
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
 
 def rerank_results(
     results: List[Any],
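
Worth flagging: this hunk uses `SentenceTransformer`, and the next one uses `util.cos_sim`, but none of the shown hunks import either name. Unless an import already exists elsewhere in app.py, the module would need the line below; this is an inferred fix, not part of the commit:

# Assumed import for the names introduced by this commit (not shown in the diff).
from sentence_transformers import SentenceTransformer, util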
@@ -520,56 +521,27 @@ def rerank_results(
     reranker: Union[TextClassificationPipeline, Any]
 ) -> List[Any]:
     """
-    Rerank search results using either a TextClassificationPipeline or a custom reranker.
-
-    Args:
-        results: List of documents/results to rerank
-        query: Search query string
-        reranker: Either a HuggingFace TextClassificationPipeline or a custom reranker
-                  with a rerank() method.
-
-    Returns:
-        List of reranked results
+
     """
     if not results:
         return results
 
-    if not hasattr(reranker, 'rerank'):
-        # For TextClassificationPipeline
-        try:
-            # Create pairs of query and document content
-            pairs = [[query, doc.page_content] for doc in results]
-
-            # Get predictions from the reranker pipeline
-            predictions = reranker(pairs)
-
-            # Extract scores with proper fallback options
-            scores = []
-            for pred in predictions:
-                if isinstance(pred, dict):
-                    score = pred.get('score',
-                                     pred.get('probability',
-                                              pred.get('confidence', 0.0)))
-                else:
-                    score = float(pred)
-                scores.append(score)
-
-            # Sort the results based on scores in descending order
-            reranked_idx = np.argsort(scores)[::-1]
-
-            # Return reranked results based on the sorted indices
-            return [results[i] for i in reranked_idx]
-
-        except Exception as e:
-            print(f"Warning: Reranking failed with error: {str(e)}")
-            return results
-    else:
-        # For custom rerankers with a dedicated rerank method
-        try:
-            return reranker.rerank(query, [doc.page_content for doc in results])
-        except Exception as e:
-            print(f"Warning: Custom reranking failed with error: {str(e)}")
-            return results
+    # Step 1: Encode the query and documents using SentenceTransformer
+    query_embedding = model.encode(query, convert_to_tensor=True)
+    doc_contents = [doc.page_content for doc in results]  # Assuming each result has a `page_content` attribute
+    doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
+
+    # Step 2: Compute cosine similarities between query and document embeddings
+    cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0]  # Shape: (number of documents,)
+
+    # Step 3: Sort documents by similarity score in descending order
+    reranked_idx = np.argsort(cosine_scores.numpy())[::-1]
+
+    # Step 4: Return the reranked documents
+    reranked_results = [results[i] for i in reranked_idx]
+
+    return reranked_results
+
 
 # Main Comparison Function
 def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, expected_result=None, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
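
One caveat on the new body: `util.cos_sim` returns a torch tensor, and `Tensor.numpy()` only works for CPU tensors. The committed code is fine with the default CPU model, but if the SentenceTransformer were moved to a GPU, Step 3 would need an explicit `.cpu()`, roughly as sketched here:

# Device-safe variant of Step 3 (assumption: the model might run on a GPU).
reranked_idx = np.argsort(cosine_scores.cpu().numpy())[::-1]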
 
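To sanity-check the rewritten function end to end, here is a minimal sketch. `Doc` is a hypothetical stand-in for result objects that expose `page_content` (e.g. LangChain documents) and is not part of the commit; the `reranker` argument is accepted but ignored by the new cosine-similarity implementation:

from dataclasses import dataclass

@dataclass
class Doc:
    page_content: str

docs = [
    Doc("Berlin is the capital of Germany."),
    Doc("The Rhine flows through western Germany."),
    Doc("Transformers pipelines generate text token by token."),
]

# The third argument is unused by the new implementation, so None is fine here.
reranked = rerank_results(docs, "What is the capital of Germany?", None)
print([d.page_content for d in reranked])  # most similar document first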