File size: 7,998 Bytes
ccb8edf
 
 
 
 
 
6b52825
84cb849
 
 
 
2618588
 
 
 
84cb849
 
 
 
 
 
 
 
 
 
 
8d0bee3
ccb8edf
84cb849
ccb8edf
 
 
 
 
84cb849
 
 
 
 
 
 
 
 
 
 
 
 
 
ccb8edf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84cb849
 
 
 
 
 
 
 
 
ccb8edf
84cb849
80f5976
84cb849
 
 
 
 
 
 
ccb8edf
84cb849
ccb8edf
84cb849
 
 
 
 
 
 
ccb8edf
84cb849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e267239
84cb849
ccb8edf
84cb849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ecfa32
ccb8edf
 
84cb849
ccb8edf
84cb849
b68ed39
84cb849
 
 
 
 
 
 
 
 
ccb8edf
84cb849
 
 
ccb8edf
 
84cb849
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import os
import time
import pdfplumber
import docx
import nltk
import gradio as gr
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    OpenAIEmbeddings,
    CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings

from langchain_community.vectorstores import FAISS, Chroma
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from langchain.retrievers import (
    VectorStoreRetriever,
    ContextualCompressionRetriever,
)
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI
from typing import List, Dict, Any
import pandas as pd

# Ensure nltk sentence tokenizer is downloaded
nltk.download('punkt', quiet=True)

FILES_DIR = './files'

# Supported embedding models
MODELS = {
    'HuggingFace': {
        'e5-base': "danielheinz/e5-base-sts-en-de",
        'multilingual-e5-base': "multilingual-e5-base",
        'paraphrase-miniLM': "paraphrase-multilingual-MiniLM-L12-v2",
        'paraphrase-mpnet': "paraphrase-multilingual-mpnet-base-v2",
        'gte-large': "gte-large",
        'gbert-base': "gbert-base"
    },
    'OpenAI': {
        'text-embedding-ada-002': "text-embedding-ada-002"
    },
    'Cohere': {
        'embed-multilingual-v2.0': "embed-multilingual-v2.0"
    }
}

class FileHandler:
    @staticmethod
    def extract_text(file_path):
        ext = os.path.splitext(file_path)[-1].lower()
        if ext == '.pdf':
            return FileHandler._extract_from_pdf(file_path)
        elif ext == '.docx':
            return FileHandler._extract_from_docx(file_path)
        elif ext == '.txt':
            return FileHandler._extract_from_txt(file_path)
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    @staticmethod
    def _extract_from_pdf(file_path):
        with pdfplumber.open(file_path) as pdf:
            return ' '.join([page.extract_text() for page in pdf.pages])

    @staticmethod
    def _extract_from_docx(file_path):
        doc = docx.Document(file_path)
        return ' '.join([para.text for para in doc.paragraphs])

    @staticmethod
    def _extract_from_txt(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

def get_embedding_model(model_type, model_name):
    if model_type == 'HuggingFace':
        return HuggingFaceEmbeddings(model_name=MODELS[model_type][model_name])
    elif model_type == 'OpenAI':
        return OpenAIEmbeddings(model=MODELS[model_type][model_name])
    elif model_type == 'Cohere':
        return CohereEmbeddings(model=MODELS[model_type][model_name])
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators=None):
    if split_strategy == 'token':
        return TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)
    elif split_strategy == 'recursive':
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap_size,
            separators=custom_separators or ["\n\n", "\n", " ", ""]
        )
    else:
        raise ValueError(f"Unsupported split strategy: {split_strategy}")

def get_vector_store(store_type, texts, embedding_model):
    if store_type == 'FAISS':
        return FAISS.from_texts(texts, embedding_model)
    elif store_type == 'Chroma':
        return Chroma.from_texts(texts, embedding_model)
    else:
        raise ValueError(f"Unsupported vector store type: {store_type}")

def get_retriever(vector_store, search_type, search_kwargs=None):
    if search_type == 'similarity':
        return vector_store.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
    elif search_type == 'mmr':
        return vector_store.as_retriever(search_type="mmr", search_kwargs=search_kwargs)
    else:
        raise ValueError(f"Unsupported search type: {search_type}")

def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators):
    # File processing
    if file_path:
        text = FileHandler.extract_text(file_path)
    else:
        text = ""
        for file in os.listdir(FILES_DIR):
            file_path = os.path.join(FILES_DIR, file)
            text += FileHandler.extract_text(file_path)

    # Split text into chunks
    text_splitter = get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators)
    chunks = text_splitter.split_text(text)

    # Get embedding model
    embedding_model = get_embedding_model(model_type, model_name)

    return chunks, embedding_model

def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k):
    # Create vector store
    vector_store = get_vector_store(vector_store_type, chunks, embedding_model)

    # Get retriever
    retriever = get_retriever(vector_store, search_type, {"k": top_k})

    # Perform search
    start_time = time.time()
    results = retriever.get_relevant_documents(query)
    end_time = time.time()

    return results, end_time - start_time

def calculate_statistics(results, search_time):
    return {
        "num_results": len(results),
        "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results),
        "search_time": search_time
    }

def format_results(results, stats):
    df = pd.DataFrame([
        {
            "Content": doc.page_content,
            "Source": doc.metadata.get("source", "Unknown"),
            "Relevance Score": doc.metadata.get("score", "N/A")
        } for doc in results
    ])
    
    formatted_stats = pd.DataFrame([stats])
    
    return gr.DataFrame(df), gr.DataFrame(formatted_stats)

def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
    all_results = []
    all_stats = []

    for model_type, model_name in zip(model_types, model_names):
        chunks, embedding_model = process_files(
            file.name if file else None,
            model_type,
            model_name,
            split_strategy,
            chunk_size,
            overlap_size,
            custom_separators.split(',') if custom_separators else None
        )

        results, search_time = search_embeddings(
            chunks,
            embedding_model,
            vector_store_type,
            search_type,
            query,
            top_k
        )

        stats = calculate_statistics(results, search_time)
        stats["model"] = f"{model_type} - {model_name}"

        all_results.append(results)
        all_stats.append(stats)

    return [format_results(results, stats) for results, stats in zip(all_results, all_stats)]

# Gradio interface
iface = gr.Interface(
    fn=compare_embeddings,
    inputs=[
        gr.File(label="Upload File (Optional)"),
        gr.Textbox(label="Search Query"),
        gr.CheckboxGroup(choices=list(MODELS.keys()), label="Embedding Model Types", value=["HuggingFace"]),
        gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base"]),
        gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
        gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
        gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
        gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
        gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
        gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
        gr.Slider(1, 10, step=1, value=5, label="Top K")
    ],
    outputs=[gr.DataFrame(label="Results"), gr.DataFrame(label="Statistics")] * len(MODELS),
    title="Embedding Comparison Tool",
    description="Compare different embedding models and retrieval strategies"
)

iface.launch()