Update app.py
app.py
CHANGED
@@ -301,7 +301,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
 
     return chunks, embedding_model, len(text.split())
 
-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
 
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
@@ -330,7 +330,9 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
-        'embedding': embeddings
+        'embedding': embeddings,
+        'length': [len(doc.page_content) for doc in results],
+        'contains_expected': [expected_result in doc.page_content if expected_result else None for doc in results]
     })
 
     return results_df, end_time - start_time, vector_store, results
@@ -340,10 +342,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 # Evaluation Metrics
 # ... (previous code remains the same)
 
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result=None):
     stats = {
         "num_results": len(results),
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+        "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+        "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -353,6 +357,10 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
         "top_k": top_k,
     }
 
+    if expected_result:
+        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
+
     if len(results) > 1000:
         embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
         pairwise_similarities = np.inner(embeddings, embeddings)
@@ -373,7 +381,6 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     stats["rank_correlation"] = rank_correlation
 
     return stats
-
 # Visualization
 def visualize_results(results_df, stats_df):
     fig, axs = plt.subplots(2, 2, figsize=(20, 20))
@@ -432,7 +439,7 @@ def rerank_results(results, query, reranker):
     return reranked_results
 
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, expected_result=None, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
@@ -489,6 +496,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             search_type,
             query,
             top_k,
+            expected_result,
             lang,
             apply_phonetic,
             phonetic_weight
@@ -500,13 +508,15 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
         result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
+        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
 
         formatted_results = format_results(results_raw, stats)
         for i, result in enumerate(formatted_results):
             result['embedding'] = result_embeddings[i]
+            result['length'] = len(result['Content'])
+            result['contains_expected'] = expected_result in result['Content'] if expected_result else None
 
         all_results.extend(formatted_results)
         all_stats.append(stats)
@@ -651,6 +661,7 @@ def launch_interface(share=True):
         with gr.Tab("Simple"):
             file_input = gr.File(label="Upload File (Optional)")
             query_input = gr.Textbox(label="Search Query")
+            expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             embedding_models_input = gr.CheckboxGroup(
                 choices=[
                     "HuggingFace:paraphrase-miniLM",
@@ -661,7 +672,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
            custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
            split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -723,7 +734,7 @@ def launch_interface(share=True):
        inputs=[
            file_input, query_input, embedding_models_input, custom_embedding_model_input,
            split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
-            vector_store_type_input, search_type_input, top_k_input, lang_input,
+            vector_store_type_input, search_type_input, top_k_input, expected_result_input, lang_input,
            apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
            phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
            custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
@@ -1066,7 +1077,80 @@ def rerank_results(results, query, reranker):
 
 This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
 
+# Template
+
+```python
+# Chat App Template
+def create_chat_app(settings):
+    def chat(message, history):
+        # Process the message using the configured embedding model and vector store
+        chunks, embedding_model, _ = process_files(
+            settings['file_path'],
+            settings['model_type'],
+            settings['model_name'],
+            settings['split_strategy'],
+            settings['chunk_size'],
+            settings['overlap_size'],
+            settings['custom_separators'],
+            settings['lang'],
+            settings['apply_preprocessing']
+        )
+
+        results, _, _, _ = search_embeddings(
+            chunks,
+            embedding_model,
+            settings['vector_store_type'],
+            settings['search_type'],
+            message,
+            settings['top_k'],
+            lang=settings['lang'],
+            apply_phonetic=settings['apply_phonetic'],
+            phonetic_weight=settings['phonetic_weight']
+        )
+
+        # Generate a response based on the retrieved results
+        # (search_embeddings returns a DataFrame, so iterate over its rows)
+        response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
+        for i, (_, result) in enumerate(results.head(settings['top_k']).iterrows()):
+            response += f"{i+1}. {result['content'][:100]}...\n\n"
+
+        # Clear the input box and append the exchange to the chat history
+        return "", history + [(message, response)]
+
+    with gr.Blocks() as chat_interface:
+        gr.Markdown(f"# Chat App using {settings['model_type']} - {settings['model_name']}")
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox()
+        clear = gr.Button("Clear")
+
+        msg.submit(chat, [msg, chatbot], [msg, chatbot])
+        clear.click(lambda: None, None, chatbot, queue=False)
+
+    return chat_interface
+
+# Sample usage of the chat app template
+sample_settings = {
+    'file_path': 'path/to/your/document.pdf',
+    'model_type': 'HuggingFace',
+    'model_name': 'paraphrase-miniLM',
+    'split_strategy': 'recursive',
+    'chunk_size': 500,
+    'overlap_size': 50,
+    'custom_separators': None,
+    'vector_store_type': 'FAISS',
+    'search_type': 'similarity',
+    'top_k': 3,
+    'lang': 'english',
+    'apply_preprocessing': True,
+    'apply_phonetic': True,
+    'phonetic_weight': 0.3
+}
+
+sample_chat_app = create_chat_app(sample_settings)
 
+if __name__ == "__main__":
+    launch_interface()
+    # Uncomment the following line to launch the sample chat app
+    # sample_chat_app.launch()
+```
 
 """
 
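Sidenote on the new metrics: the expected-result check this commit adds is plain substring matching over the retrieved chunks, and the rank expression maps "not found" to 0 via `next(..., -1) + 1`. Below is a minimal standalone sketch of just that logic; the `Doc` dataclass is a stand-in for the LangChain `Document` objects the app actually retrieves.

```python
# Stand-in for LangChain's Document; only page_content matters here.
from dataclasses import dataclass

@dataclass
class Doc:
    page_content: str

results = [Doc("alpha beta"), Doc("gamma delta"), Doc("delta epsilon")]
expected_result = "delta"

# True if any retrieved chunk contains the expected string (substring match).
contains_expected = any(expected_result in doc.page_content for doc in results)

# 1-based rank of the first matching chunk; next() falls back to -1,
# so a miss becomes rank 0.
expected_result_rank = next(
    (i for i, doc in enumerate(results) if expected_result in doc.page_content),
    -1,
) + 1

print(contains_expected)     # True
print(expected_result_rank)  # 2 (the second chunk is the first match)
```

Because the comparison is a raw substring test, it is case-sensitive and runs against the stored chunk text rather than the preprocessed or phonetically encoded form, so an exact-case mismatch reports rank 0 even when the retrieved chunk is semantically correct.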