Update app.py
app.py
CHANGED
@@ -6,36 +6,25 @@ import nltk
 import gradio as gr
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.embeddings import (
-    #HuggingFaceEmbeddings,
     OpenAIEmbeddings,
     CohereEmbeddings,
 )
 from langchain_openai import OpenAIEmbeddings
-
 from langchain_community.vectorstores import FAISS, Chroma
 from langchain_text_splitters import (
     RecursiveCharacterTextSplitter,
     TokenTextSplitter,
 )
-#from langchain.retrievers import (
-#    VectorStoreRetriever,
-#    ContextualCompressionRetriever,
-#)
-from langchain.retrievers.document_compressors import LLMChainExtractor
-from langchain_community.llms import OpenAI
 from typing import List, Dict, Any
 import pandas as pd
 
-# Ensure nltk sentence tokenizer is downloaded
 nltk.download('punkt', quiet=True)
 
 FILES_DIR = './files'
 
-# Supported embedding models
 MODELS = {
     'HuggingFace': {
-        'e5-base': "danielheinz/e5-base-sts-en-de",
-        'multilingual-e5-base': "multilingual-e5-base",
+        'e5-base-de': "danielheinz/e5-base-sts-en-de",
         'paraphrase-miniLM': "paraphrase-multilingual-MiniLM-L12-v2",
         'paraphrase-mpnet': "paraphrase-multilingual-mpnet-base-v2",
         'gte-large': "gte-large",
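Note: the renamed 'e5-base-de' key is consumed by the get_embedding_model helper referenced later in this diff but not shown in it. A minimal sketch of how that lookup plausibly works, assuming the MODELS structure above (the function body here is illustrative, not the file's actual implementation):

from langchain_huggingface import HuggingFaceEmbeddings

def get_embedding_model(model_type, model_name):
    # Assumed shape: MODELS maps short names to repo ids,
    # e.g. 'e5-base-de' -> "danielheinz/e5-base-sts-en-de".
    if model_type == 'HuggingFace':
        return HuggingFaceEmbeddings(model_name=MODELS['HuggingFace'][model_name])
    raise ValueError(f"Unsupported model type: {model_type}")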
@@ -116,7 +105,6 @@ def get_retriever(vector_store, search_type, search_kwargs=None):
|
|
116 |
raise ValueError(f"Unsupported search type: {search_type}")
|
117 |
|
118 |
def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators):
|
119 |
-
# File processing
|
120 |
if file_path:
|
121 |
text = FileHandler.extract_text(file_path)
|
122 |
else:
|
@@ -125,45 +113,49 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
125 |
file_path = os.path.join(FILES_DIR, file)
|
126 |
text += FileHandler.extract_text(file_path)
|
127 |
|
128 |
-
# Split text into chunks
|
129 |
text_splitter = get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators)
|
130 |
chunks = text_splitter.split_text(text)
|
131 |
|
132 |
-
# Get embedding model
|
133 |
embedding_model = get_embedding_model(model_type, model_name)
|
134 |
|
135 |
-
return chunks, embedding_model
|
136 |
|
137 |
def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k):
|
138 |
-
# Create vector store
|
139 |
vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
|
140 |
-
|
141 |
-
# Get retriever
|
142 |
retriever = get_retriever(vector_store, search_type, {"k": top_k})
|
143 |
|
144 |
-
# Perform search
|
145 |
start_time = time.time()
|
146 |
results = retriever.get_relevant_documents(query)
|
147 |
end_time = time.time()
|
148 |
|
149 |
-
return results, end_time - start_time
|
150 |
|
151 |
-
def calculate_statistics(results, search_time):
|
152 |
return {
|
153 |
"num_results": len(results),
|
154 |
-
"avg_content_length": sum(len(doc.page_content) for doc in results) / len(results),
|
155 |
-
"search_time": search_time
|
|
|
|
|
|
|
|
|
156 |
}
|
157 |
|
158 |
-
import gradio as gr
|
159 |
-
import pandas as pd
|
160 |
-
|
161 |
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
|
162 |
all_results = []
|
163 |
all_stats = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
for model_type, model_name in zip(model_types, model_names):
|
166 |
-
chunks, embedding_model = process_files(
|
167 |
file.name if file else None,
|
168 |
model_type,
|
169 |
model_name,
|
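The three functions above now return richer tuples. A hedged end-to-end sketch of how they chain together after this change (file name, query, and parameter values are illustrative only):

chunks, embedding_model, num_tokens = process_files(
    "sample.txt", "HuggingFace", "e5-base-de", "recursive", 500, 50, None
)
results, search_time, vector_store = search_embeddings(
    chunks, embedding_model, "FAISS", "similarity", "What is machine learning?", 5
)
stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)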
@@ -173,7 +165,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
173 |
custom_separators.split(',') if custom_separators else None
|
174 |
)
|
175 |
|
176 |
-
results, search_time = search_embeddings(
|
177 |
chunks,
|
178 |
embedding_model,
|
179 |
vector_store_type,
|
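Note: the new "vector_store_size" stat probes a private _index attribute, but LangChain's FAISS wrapper exposes the raw index publicly as vector_store.index, so that check likely always falls back to "N/A". A sketch covering both spellings (assumes a FAISS-style ntotal counter; Chroma has neither attribute and still yields "N/A"):

def vector_store_size(vector_store):
    # Prefer the public attribute, then the private spelling used in the diff.
    index = getattr(vector_store, "index", None) or getattr(vector_store, "_index", None)
    return index.ntotal if index is not None else "N/A"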
@@ -182,39 +174,30 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
|
|
182 |
top_k
|
183 |
)
|
184 |
|
185 |
-
stats = calculate_statistics(results, search_time)
|
186 |
stats["model"] = f"{model_type} - {model_name}"
|
|
|
187 |
|
188 |
-
formatted_results
|
189 |
-
all_results.
|
190 |
-
all_stats.append(
|
191 |
|
192 |
-
|
|
|
193 |
|
194 |
-
|
195 |
-
# List to store the processed document data
|
196 |
-
data = []
|
197 |
|
198 |
-
|
|
|
199 |
for doc in results:
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
data.append(doc_data)
|
209 |
-
|
210 |
-
# Convert the list of document data into a DataFrame
|
211 |
-
df = pd.DataFrame(data)
|
212 |
-
|
213 |
-
# Formatting stats as a DataFrame
|
214 |
-
formatted_stats = pd.DataFrame([stats])
|
215 |
-
|
216 |
-
return df, formatted_stats
|
217 |
-
|
218 |
|
219 |
# Gradio interface
|
220 |
iface = gr.Interface(
|
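For clarity, an illustrative row produced by the new format_results (all values are made up; the extra keys are merged in from doc.metadata and the stats/settings dict):

row = {
    "Content": "…retrieved chunk text…",
    "Model": "HuggingFace - e5-base-de",
    "num_results": 5,
    "search_time": 0.012,
    "chunk_size": 500,
    # ...remaining metadata, stats, and settings keys...
}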
@@ -223,7 +206,7 @@ iface = gr.Interface(
         gr.File(label="Upload File (Optional)"),
         gr.Textbox(label="Search Query"),
         gr.CheckboxGroup(choices=list(MODELS.keys()), label="Embedding Model Types", value=["HuggingFace"]),
-        gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base"]),
+        gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base-de"]),
         gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
         gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
         gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
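Note: iterating the per-type dicts in MODELS.values() yields their keys, so the flattened choices are the short model names, which is why the default had to change from "e5-base" to "e5-base-de". With the MODELS entries visible in this diff, the list starts as:

choices = [model for models in MODELS.values() for model in models]
# ['e5-base-de', 'paraphrase-miniLM', 'paraphrase-mpnet', 'gte-large', ...]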
@@ -237,7 +220,53 @@ iface = gr.Interface(
         gr.Dataframe(label="Statistics")
     ],
     title="Embedding Comparison Tool",
-    description="Compare different embedding models and retrieval strategies"
+    description="Compare different embedding models and retrieval strategies",
+    examples=[
+        ["example.pdf", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
+    ],
+    allow_flagging="never"
+)
+
+tutorial_md = """
+# Embedding Comparison Tool Tutorial
+
+This tool allows you to compare different embedding models and retrieval strategies for document search. Here's how to use it:
+
+1. **File Upload**: Optionally upload a file (PDF, DOCX, or TXT) or leave it empty to use files in the `./files` directory.
+
+2. **Search Query**: Enter the search query you want to use for retrieving relevant documents.
+
+3. **Embedding Model Types**: Select one or more embedding model types (HuggingFace, OpenAI, Cohere).
+
+4. **Embedding Models**: Choose specific models for each selected model type.
+
+5. **Split Strategy**: Select either 'token' or 'recursive' for text splitting.
+
+6. **Chunk Size**: Set the size of text chunks (100-1000).
+
+7. **Overlap Size**: Set the overlap between chunks (0-100).
+
+8. **Custom Split Separators**: Optionally enter custom separators for text splitting.
+
+9. **Vector Store Type**: Choose between FAISS and Chroma for storing vectors.
+
+10. **Search Type**: Select 'similarity' or 'mmr' (Maximum Marginal Relevance) search.
+
+11. **Top K**: Set the number of top results to retrieve (1-10).
+
+After setting these parameters, click "Submit" to run the comparison. The results will be displayed in two tables:
+
+- **Results**: Shows the retrieved document contents and metadata for each model.
+- **Statistics**: Provides performance metrics and settings for each model.
+
+You can download the results as CSV files for further analysis.
+
+Experiment with different settings to find the best combination for your specific use case!
+"""
+
+iface = gr.TabbedInterface(
+    [iface, gr.Markdown(tutorial_md)],
+    ["Embedding Comparison", "Tutorial"]
 )
 
-iface.launch()
+iface.launch(share=True)
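Note: when an app runs on Hugging Face Spaces it is already hosted, and Gradio ignores share=True there with a warning; the flag only matters for local runs. A guarded launch, assuming the SPACE_ID environment variable that Spaces sets:

import os

# share only when running outside a Space (SPACE_ID unset locally).
iface.launch(share=not os.environ.get("SPACE_ID"))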