Chris4K committed
Commit c38e61c
1 Parent(s): 6fa8e54

Update app.py

Files changed (1)
  1. app.py +92 -8
app.py CHANGED
@@ -301,7 +301,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
 
     return chunks, embedding_model, len(text.split())
 
-def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, lang='german', apply_phonetic=True, phonetic_weight=0.3):
+def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=True, phonetic_weight=0.3):
     preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
 
     vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
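The commit threads a new optional `expected_result` through the retrieval path. Since it lands ahead of `lang` in the parameter list, keyword arguments are the safest way to call the extended function; a minimal sketch, with the query string and expected substring invented for illustration:

```python
# Hypothetical call against the extended signature; `chunks` and
# `embedding_model` are assumed to come from process_files() as above.
results_df, search_time, vector_store, results = search_embeddings(
    chunks,
    embedding_model,
    vector_store_type="FAISS",
    search_type="similarity",
    query="Wie hoch ist die monatliche Miete?",  # invented example query
    top_k=5,
    expected_result="1.200 Euro",  # substring a correct hit should contain
    lang="german",
)
```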
@@ -330,7 +330,9 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 
     results_df = pd.DataFrame({
         'content': [doc.page_content for doc in results],
-        'embedding': embeddings
+        'embedding': embeddings,
+        'length': [len(doc.page_content) for doc in results],
+        'contains_expected': [expected_result in doc.page_content if expected_result else None for doc in results]
     })
 
     return results_df, end_time - start_time, vector_store, results
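The new `contains_expected` column uses a conditional expression evaluated per row, so when no expected result is supplied the column is filled with `None` rather than omitted. A self-contained sketch of that behavior, with a toy stand-in for the document objects:

```python
# Toy illustration of the contains_expected comprehension; Doc stands in
# for the retrieved documents, which expose .page_content.
from dataclasses import dataclass

@dataclass
class Doc:
    page_content: str

results = [Doc("Die Miete beträgt 1.200 Euro."), Doc("Nebenkosten: 150 Euro.")]

expected_result = "1.200 Euro"
print([expected_result in d.page_content if expected_result else None for d in results])
# -> [True, False]

expected_result = None
print([expected_result in d.page_content if expected_result else None for d in results])
# -> [None, None]: the expression yields None per row instead of raising
```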
@@ -340,10 +342,12 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
 # Evaluation Metrics
 # ... (previous code remains the same)
 
-def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k):
+def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result=None):
     stats = {
         "num_results": len(results),
         "avg_content_length": np.mean([len(doc.page_content) for doc in results]) if results else 0,
+        "min_content_length": min([len(doc.page_content) for doc in results]) if results else 0,
+        "max_content_length": max([len(doc.page_content) for doc in results]) if results else 0,
         "search_time": search_time,
         "vector_store_size": vector_store._index.ntotal if hasattr(vector_store, '_index') else "N/A",
         "num_documents": len(vector_store.docstore._dict),
@@ -353,6 +357,10 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
         "top_k": top_k,
     }
 
+    if expected_result:
+        stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
+        stats["expected_result_rank"] = next((i for i, doc in enumerate(results) if expected_result in doc.page_content), -1) + 1
+
     if len(results) > 1000:
         embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
         pairwise_similarities = np.inner(embeddings, embeddings)
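The `expected_result_rank` expression compresses a linear scan into one line: `next()` yields the index of the first matching document or the default `-1`, and the trailing `+ 1` converts it to a 1-based rank, with `0` meaning the expected text never appeared. A toy check, using plain strings in place of documents:

```python
# Toy check of the expected_result_rank logic from calculate_statistics.
def rank_of(expected, contents):
    return next((i for i, text in enumerate(contents) if expected in text), -1) + 1

contents = ["alpha result", "beta result", "gamma result"]
print(rank_of("beta", contents))   # 2 -> first match is the second result
print(rank_of("delta", contents))  # 0 -> no result contains the text
```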
@@ -373,7 +381,6 @@ def calculate_statistics(results, search_time, vector_store, num_tokens, embeddi
     stats["rank_correlation"] = rank_correlation
 
     return stats
-
 # Visualization
 def visualize_results(results_df, stats_df):
     fig, axs = plt.subplots(2, 2, figsize=(20, 20))
@@ -432,7 +439,7 @@ def rerank_results(results, query, reranker):
     return reranked_results
 
 # Main Comparison Function
-def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, expected_result=None, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
     all_results = []
     all_stats = []
     settings = {
@@ -489,6 +496,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
             search_type,
             query,
             top_k,
+            expected_result,
             lang,
             apply_phonetic,
             phonetic_weight
@@ -500,13 +508,15 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
 
         result_embeddings = [doc.metadata.get('embedding', None) for doc in results_raw]
 
-        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k)
+        stats = calculate_statistics(results_raw, search_time, vector_store, num_tokens, embedding_model, query, top_k, expected_result)
         stats["model"] = f"{model_type} - {model_name}"
         stats.update(settings)
 
         formatted_results = format_results(results_raw, stats)
         for i, result in enumerate(formatted_results):
             result['embedding'] = result_embeddings[i]
+            result['length'] = len(result['Content'])
+            result['contains_expected'] = expected_result in result['Content'] if expected_result else None
 
         all_results.extend(formatted_results)
         all_stats.append(stats)
@@ -651,6 +661,7 @@ def launch_interface(share=True):
         with gr.Tab("Simple"):
             file_input = gr.File(label="Upload File (Optional)")
             query_input = gr.Textbox(label="Search Query")
+            expected_result_input = gr.Textbox(label="Expected Result (Optional)")
             embedding_models_input = gr.CheckboxGroup(
                 choices=[
                     "HuggingFace:paraphrase-miniLM",
@@ -661,7 +672,7 @@ def launch_interface(share=True):
                 label="Embedding Models"
             )
             top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
-
+
         with gr.Tab("Advanced"):
             custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
             split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
@@ -723,7 +734,7 @@ def rerank_results(results, query, reranker):
         inputs=[
             file_input, query_input, embedding_models_input, custom_embedding_model_input,
             split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
-            vector_store_type_input, search_type_input, top_k_input, lang_input,
+            vector_store_type_input, search_type_input, top_k_input, expected_result_input, lang_input,
             apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
             phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
             custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
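Gradio binds `inputs` to the callback positionally, which is why `expected_result_input` is spliced in exactly where `expected_result` sits in the `compare_embeddings` signature. A stripped-down sketch of the pattern (component names shortened and the callback invented):

```python
# Minimal sketch of positional input binding in Gradio: the component order
# in `inputs` must mirror the parameter order of the callback.
import gradio as gr

def compare(query, top_k, expected_result, lang):
    return f"query={query!r}, top_k={top_k}, expected={expected_result!r}, lang={lang}"

with gr.Blocks() as demo:
    query = gr.Textbox(label="Search Query")
    top_k = gr.Slider(1, 10, step=1, value=5, label="Top K")
    expected = gr.Textbox(label="Expected Result (Optional)")
    lang = gr.Radio(choices=["german", "english"], value="german", label="Language")
    out = gr.Textbox(label="Echo")
    gr.Button("Run").click(compare, inputs=[query, top_k, expected, lang], outputs=out)

# demo.launch()
```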
@@ -1066,7 +1077,80 @@ def rerank_results(results, query, reranker):
 
 This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use the insights to build an efficient information retrieval and generation system.
 
+# Template
+
+```python
+# Chat App Template
+def create_chat_app(settings):
+    def chat(message, history):
+        # Process the document using the configured embedding model and vector store
+        chunks, embedding_model, _ = process_files(
+            settings['file_path'],
+            settings['model_type'],
+            settings['model_name'],
+            settings['split_strategy'],
+            settings['chunk_size'],
+            settings['overlap_size'],
+            settings['custom_separators'],
+            settings['lang'],
+            settings['apply_preprocessing']
+        )
+
+        results, _, _, _ = search_embeddings(
+            chunks,
+            embedding_model,
+            settings['vector_store_type'],
+            settings['search_type'],
+            message,
+            settings['top_k'],
+            lang=settings['lang'],
+            apply_phonetic=settings['apply_phonetic'],
+            phonetic_weight=settings['phonetic_weight']
+        )
+
+        # Build a response from the retrieved results; `results` is the
+        # DataFrame returned by search_embeddings, so read its 'content' column
+        response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
+        for i, content in enumerate(results['content'][:settings['top_k']]):
+            response += f"{i+1}. {content[:100]}...\n\n"
+
+        # Return one value per output component: clear the textbox and
+        # append the new exchange to the chat history
+        return "", history + [(message, response)]
+
+    with gr.Blocks() as chat_interface:
+        gr.Markdown(f"# Chat App using {settings['model_type']} - {settings['model_name']}")
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox()
+        clear = gr.Button("Clear")
+
+        msg.submit(chat, [msg, chatbot], [msg, chatbot])
+        clear.click(lambda: None, None, chatbot, queue=False)
+
+    return chat_interface
+
+# Sample usage of the chat app template
+sample_settings = {
+    'file_path': 'path/to/your/document.pdf',
+    'model_type': 'HuggingFace',
+    'model_name': 'paraphrase-miniLM',
+    'split_strategy': 'recursive',
+    'chunk_size': 500,
+    'overlap_size': 50,
+    'custom_separators': None,
+    'vector_store_type': 'FAISS',
+    'search_type': 'similarity',
+    'top_k': 3,
+    'lang': 'english',
+    'apply_preprocessing': True,
+    'apply_phonetic': True,
+    'phonetic_weight': 0.3
+}
+
+sample_chat_app = create_chat_app(sample_settings)
 
+if __name__ == "__main__":
+    launch_interface()
+    # Uncomment the following line to launch the sample chat app
+    # sample_chat_app.launch(share=True)
+```
 
 """