Chris4K committed
Commit
9efbb97
1 Parent(s): 4d97daa

Update app.py

Files changed (1): app.py (+88 -59)
app.py CHANGED
@@ -6,36 +6,25 @@ import nltk
 import gradio as gr
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.embeddings import (
-    #HuggingFaceEmbeddings,
     OpenAIEmbeddings,
     CohereEmbeddings,
 )
 from langchain_openai import OpenAIEmbeddings
-
 from langchain_community.vectorstores import FAISS, Chroma
 from langchain_text_splitters import (
     RecursiveCharacterTextSplitter,
     TokenTextSplitter,
 )
-#from langchain.retrievers import (
-#    VectorStoreRetriever,
-#    ContextualCompressionRetriever,
-#)
-from langchain.retrievers.document_compressors import LLMChainExtractor
-from langchain_community.llms import OpenAI
 from typing import List, Dict, Any
 import pandas as pd
 
-# Ensure nltk sentence tokenizer is downloaded
 nltk.download('punkt', quiet=True)
 
 FILES_DIR = './files'
 
-# Supported embedding models
 MODELS = {
     'HuggingFace': {
-        'e5-base': "danielheinz/e5-base-sts-en-de",
-        'multilingual-e5-base': "multilingual-e5-base",
+        'e5-base-de': "danielheinz/e5-base-sts-en-de",
         'paraphrase-miniLM': "paraphrase-multilingual-MiniLM-L12-v2",
         'paraphrase-mpnet': "paraphrase-multilingual-mpnet-base-v2",
         'gte-large': "gte-large",
@@ -116,7 +105,6 @@ def get_retriever(vector_store, search_type, search_kwargs=None):
         raise ValueError(f"Unsupported search type: {search_type}")
 
 def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators):
-    # File processing
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:
@@ -125,45 +113,49 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
             file_path = os.path.join(FILES_DIR, file)
             text += FileHandler.extract_text(file_path)
 
-    # Split text into chunks
     text_splitter = get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators)
     chunks = text_splitter.split_text(text)
 
-    # Get embedding model
     embedding_model = get_embedding_model(model_type, model_name)
 
-    return chunks, embedding_model
+    return chunks, embedding_model, len(text.split())
 
 def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k):
-    # Create vector store
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
-
-    # Get retriever
     retriever = get_retriever(vector_store, search_type, {"k": top_k})
 
-    # Perform search
     start_time = time.time()
     results = retriever.get_relevant_documents(query)
    end_time = time.time()
 
-    return results, end_time - start_time
+    return results, end_time - start_time, vector_store
 
-def calculate_statistics(results, search_time):
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
     return {
         "num_results": len(results),
-        "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results),
-        "search_time": search_time
+        "avg_content_length": sum(len(doc.page_content) for doc in results) / len(results) if results else 0,
+        "search_time": search_time,
+        "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
+        "num_documents": len(vector_store.docstore._dict),
+        "num_tokens": num_tokens,
+        "embedding_vocab_size": embedding_model.client.get_vocab_size() if hasattr(embedding_model, 'client') and hasattr(embedding_model.client, 'get_vocab_size') else "N/A"
     }
 
-import gradio as gr
-import pandas as pd
-
 def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
     all_results = []
     all_stats = []
+    settings = {
+        "split_strategy": split_strategy,
+        "chunk_size": chunk_size,
+        "overlap_size": overlap_size,
+        "custom_separators": custom_separators,
+        "vector_store_type": vector_store_type,
+        "search_type": search_type,
+        "top_k": top_k
+    }
 
     for model_type, model_name in zip(model_types, model_names):
-        chunks, embedding_model = process_files(
+        chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
             model_type,
             model_name,
@@ -173,7 +165,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             custom_separators.split(',') if custom_separators else None
         )
 
-        results, search_time = search_embeddings(
+        results, search_time, vector_store = search_embeddings(
             chunks,
             embedding_model,
             vector_store_type,
@@ -182,39 +174,30 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             top_k
         )
 
-        stats = calculate_statistics(results, search_time)
+        stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)
         stats["model"] = f"{model_type} - {model_name}"
+        stats.update(settings)
 
-        formatted_results, formatted_stats = format_results(results, stats)
-        all_results.append(formatted_results)
-        all_stats.append(formatted_stats)
+        formatted_results = format_results(results, stats)
+        all_results.extend(formatted_results)
+        all_stats.append(stats)
 
-    return all_results + all_stats
+    results_df = pd.DataFrame(all_results)
+    stats_df = pd.DataFrame(all_stats)
+
+    return results_df, stats_df
 
-def format_results(results, stats):
-    # List to store the processed document data
-    data = []
-
-    # Extracting content and metadata from each document
+def format_results(results, stats):
+    formatted_results = []
     for doc in results:
-        # Ensure metadata is a dictionary (if it's a custom object, convert it)
-        metadata_dict = dict(doc.metadata)
-
-        # Create a combined dictionary with 'Content' and all metadata fields
-        doc_data = {"Content": doc.page_content}
-        doc_data.update(metadata_dict)  # Add all metadata key-value pairs
-
-        # Append the processed document data to the list
-        data.append(doc_data)
-
-    # Convert the list of document data into a DataFrame
-    df = pd.DataFrame(data)
-
-    # Formatting stats as a DataFrame
-    formatted_stats = pd.DataFrame([stats])
-
-    return df, formatted_stats
-
+        result = {
+            "Content": doc.page_content,
+            "Model": stats["model"],
+            **doc.metadata,
+            **{k: v for k, v in stats.items() if k not in ["model"]}
+        }
+        formatted_results.append(result)
+    return formatted_results
 
 # Gradio interface
 iface = gr.Interface(
@@ -223,7 +206,7 @@ iface = gr.Interface(
         gr.File(label="Upload File (Optional)"),
         gr.Textbox(label="Search Query"),
         gr.CheckboxGroup(choices=list(MODELS.keys()), label="Embedding Model Types", value=["HuggingFace"]),
-        gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base"]),
+        gr.CheckboxGroup(choices=[model for models in MODELS.values() for model in models], label="Embedding Models", value=["e5-base-de"]),
         gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
         gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
         gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
@@ -237,7 +220,53 @@ iface = gr.Interface(
         gr.Dataframe(label="Statistics")
     ],
    title="Embedding Comparison Tool",
-    description="Compare different embedding models and retrieval strategies"
+    description="Compare different embedding models and retrieval strategies",
+    examples=[
+        ["example.pdf", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
+    ],
+    allow_flagging="never"
+)
+
+tutorial_md = """
+# Embedding Comparison Tool Tutorial
+
+This tool allows you to compare different embedding models and retrieval strategies for document search. Here's how to use it:
+
+1. **File Upload**: Optionally upload a file (PDF, DOCX, or TXT) or leave it empty to use files in the `./files` directory.
+
+2. **Search Query**: Enter the search query you want to use for retrieving relevant documents.
+
+3. **Embedding Model Types**: Select one or more embedding model types (HuggingFace, OpenAI, Cohere).
+
+4. **Embedding Models**: Choose specific models for each selected model type.
+
+5. **Split Strategy**: Select either 'token' or 'recursive' for text splitting.
+
+6. **Chunk Size**: Set the size of text chunks (100-1000).
+
+7. **Overlap Size**: Set the overlap between chunks (0-100).
+
+8. **Custom Split Separators**: Optionally enter custom separators for text splitting.
+
+9. **Vector Store Type**: Choose between FAISS and Chroma for storing vectors.
+
+10. **Search Type**: Select 'similarity' or 'mmr' (Maximum Marginal Relevance) search.
+
+11. **Top K**: Set the number of top results to retrieve (1-10).
+
+After setting these parameters, click "Submit" to run the comparison. The results will be displayed in two tables:
+
+- **Results**: Shows the retrieved document contents and metadata for each model.
+- **Statistics**: Provides performance metrics and settings for each model.
+
+You can download the results as CSV files for further analysis.
+
+Experiment with different settings to find the best combination for your specific use case!
+"""
+
+iface = gr.TabbedInterface(
+    [iface, gr.Markdown(tutorial_md)],
+    ["Embedding Comparison", "Tutorial"]
 )
 
-iface.launch()
+iface.launch(share=True)
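
Note on the new num_tokens value: process_files returns len(text.split()), which is a whitespace word count rather than tokenizer tokens. Since app.py already downloads NLTK's punkt data, an NLTK-based count is one alternative; neither matches the embedding model's own subword count. A minimal sketch (nltk assumed installed; newer NLTK releases also want the punkt_tab resource):

import nltk

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)   # needed by word_tokenize on recent NLTK versions

def num_tokens_whitespace(text: str) -> int:
    return len(text.split())              # what the commit computes

def num_tokens_nltk(text: str) -> int:
    return len(nltk.word_tokenize(text))  # splits punctuation; closer to linguistic tokens

print(num_tokens_whitespace("Hello, world!"))  # 2
print(num_tokens_nltk("Hello, world!"))        # 4: ['Hello', ',', 'world', '!']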
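
Note on the retrieval call: search_embeddings still uses retriever.get_relevant_documents(query), which newer LangChain releases deprecate in favour of the Runnable-style retriever.invoke(query). A drop-in variant of the timed search, assuming a recent langchain-core:

import time

def timed_retrieve(retriever, query):
    # invoke() is the non-deprecated spelling of get_relevant_documents() on
    # current LangChain retrievers; it returns the same list of Documents.
    start_time = time.time()
    results = retriever.invoke(query)
    return results, time.time() - start_time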
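
Note on the new statistics: calculate_statistics probes private, version-dependent store internals. LangChain's FAISS wrapper typically exposes the raw faiss index as .index rather than _index, so the hasattr(vector_store, '_index') check may always fall through to "N/A", and Chroma stores have no docstore attribute, so len(vector_store.docstore._dict) would raise for them. A defensive sketch (an assumption about current wrapper attributes, not the committed code):

def vector_store_stats(vector_store):
    """Best-effort size probes that never raise on unknown store internals."""
    # FAISS exposes the raw index as `.index` in current LangChain; other or
    # older wrappers may use `_index`. Either way, the index carries `ntotal`.
    size = "N/A"
    for attr in ("index", "_index"):
        inner = getattr(vector_store, attr, None)
        if inner is not None and hasattr(inner, "ntotal"):
            size = inner.ntotal
            break
    # Only FAISS keeps an in-memory docstore dict; Chroma does not.
    docstore = getattr(vector_store, "docstore", None)
    num_docs = len(docstore._dict) if hasattr(docstore, "_dict") else "N/A"
    return {"vector_store_size": size, "num_documents": num_docs}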
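
Note on the reworked format_results: it flattens each retrieved chunk into one dict that mixes content, metadata, and the per-model stats, so pd.DataFrame(all_results) yields one row per chunk across all models. A minimal self-contained sketch of that behaviour, using a stand-in Doc class (hypothetical; the real objects are LangChain Documents):

from dataclasses import dataclass, field
import pandas as pd

@dataclass
class Doc:                      # stand-in for langchain_core.documents.Document
    page_content: str
    metadata: dict = field(default_factory=dict)

def format_results(results, stats):          # same logic as the committed version
    formatted_results = []
    for doc in results:
        formatted_results.append({
            "Content": doc.page_content,
            "Model": stats["model"],
            **doc.metadata,
            **{k: v for k, v in stats.items() if k != "model"},
        })
    return formatted_results

docs = [Doc("chunk one", {"source": "a.txt"}), Doc("chunk two", {"source": "b.txt"})]
rows = format_results(docs, {"model": "HuggingFace - e5-base-de", "search_time": 0.12})
print(pd.DataFrame(rows))   # two rows; columns: Content, Model, source, search_time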