Chris4K committed on
Commit
ea0ce95
1 Parent(s): 950a593

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -14
app.py CHANGED
@@ -116,7 +116,10 @@ class FileHandler:
116
  def simple_tokenize(text):
117
  return text.split()
118
 
119
- def preprocess_text(text, lang='german'):
 
 
 
120
  text = text.lower()
121
  text = re.sub(r'[^a-zA-Z\s]', '', text)
122
 
@@ -141,13 +144,29 @@ def preprocess_text(text, lang='german'):
141
 
142
  return ' '.join(tokens)
143
 
144
def phonetic_match(text, query, method='levenshtein_distance'):
    """Compare ``text`` and ``query`` by their Soundex codes.

    Args:
        text: String to compare against the query.
        query: The search query string.
        method: Comparison strategy; only ``'levenshtein_distance'`` is
            implemented.

    Returns:
        The Levenshtein distance between the two Soundex codes, or ``0`` for
        any other ``method``. Note that ``0`` is therefore also what an
        exact phonetic match produces.
    """
    # Guard clause: unsupported strategies contribute nothing.
    if method != 'levenshtein_distance':
        return 0

    encoded_text = jellyfish.soundex(text)
    encoded_query = jellyfish.soundex(query)
    return jellyfish.levenshtein_distance(encoded_text, encoded_query)
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
152
  tokenized_texts = [text.split() for text in texts]
153
 
@@ -399,7 +418,7 @@ def rerank_results(results, query, reranker):
399
  return reranked_results
400
 
401
  # Main Comparison Function
402
- def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
403
  all_results = []
404
  all_stats = []
405
  settings = {
@@ -431,6 +450,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
431
  overlap_size,
432
  custom_separators.split(',') if custom_separators else None,
433
  lang,
 
434
  custom_tokenizer_file,
435
  custom_tokenizer_model,
436
  int(custom_tokenizer_vocab_size),
@@ -442,12 +462,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
442
  chunks = optimized_chunks
443
 
444
  if use_query_optimization:
445
- llm = HuggingFacePipeline.from_model_id(
446
- model_id="google/flan-t5-base",
447
- task="text2text-generation",
448
- model_kwargs={"temperature": 0, "max_length": 64},
449
- )
450
- optimized_queries = optimize_query(query, llm)
451
  query = " ".join(optimized_queries)
452
 
453
  results, search_time, vector_store, results_raw = search_embeddings(
@@ -458,9 +473,10 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
458
  query,
459
  top_k,
460
  lang,
 
461
  phonetic_weight
462
  )
463
-
464
  if use_reranking:
465
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
466
  results_raw = rerank_results(results_raw, query, reranker)
@@ -506,7 +522,15 @@ def launch_interface(share=True):
506
  with gr.Tab("Simple"):
507
  file_input = gr.File(label="Upload File (Optional)")
508
  query_input = gr.Textbox(label="Search Query")
509
- embedding_models_input = gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)")
 
 
 
 
 
 
 
 
510
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
511
 
512
  with gr.Tab("Advanced"):
@@ -520,13 +544,16 @@ def launch_interface(share=True):
520
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
521
 
522
  with gr.Tab("Optional"):
 
523
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
 
524
  phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
525
  custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
526
  custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
527
  custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
528
  custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
529
  use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
 
530
  use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
531
 
532
  results_output = gr.Dataframe(label="Results", interactive=False)
@@ -540,13 +567,15 @@ def launch_interface(share=True):
540
  file_input, query_input, embedding_models_input, custom_embedding_model_input,
541
  split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
542
  vector_store_type_input, search_type_input, top_k_input, lang_input,
543
- optimize_vocab_input, phonetic_weight_input, custom_tokenizer_file_input,
544
- custom_tokenizer_model_input, custom_tokenizer_vocab_size_input,
545
- custom_tokenizer_special_tokens_input, use_query_optimization_input, use_reranking_input
 
546
  ],
547
  outputs=[results_output, stats_output, plot_output]
548
  )
549
 
 
550
  tutorial_md = """
551
  # Advanced Embedding Comparison Tool Tutorial
552
 
 
116
  def simple_tokenize(text):
117
  return text.split()
118
 
119
+ def preprocess_text(text, lang='german', apply_preprocessing=True):
120
+ if not apply_preprocessing:
121
+ return text
122
+
123
  text = text.lower()
124
  text = re.sub(r'[^a-zA-Z\s]', '', text)
125
 
 
144
 
145
  return ' '.join(tokens)
146
 
147
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True):
    """Compare ``text`` and ``query`` by their Soundex codes.

    Args:
        text: String to compare against the query.
        query: The search query string.
        method: Comparison strategy; only ``'levenshtein_distance'`` is
            implemented.
        apply_phonetic: When falsy, phonetic matching is switched off and
            the function returns ``0`` without encoding anything.

    Returns:
        The Levenshtein distance between the two Soundex codes, or ``0`` when
        matching is disabled or ``method`` is not supported. Note that ``0``
        is therefore also what an exact phonetic match produces.
    """
    if apply_phonetic and method == 'levenshtein_distance':
        encoded_text = jellyfish.soundex(text)
        encoded_query = jellyfish.soundex(query)
        return jellyfish.levenshtein_distance(encoded_text, encoded_query)

    # Disabled, or a strategy this function does not implement.
    return 0
155
 
156
def optimize_query(query, llm_model, vector_store=None, search_type=None, search_kwargs=None):
    """Generate alternative phrasings of ``query`` with a multi-query retriever.

    Args:
        query: The user's search query.
        llm_model: Hugging Face model id, loaded as a
            ``text2text-generation`` pipeline.
        vector_store: Vector store handed to ``get_retriever``. Required.
        search_type: Forwarded unchanged to ``get_retriever``.
        search_kwargs: Forwarded unchanged to ``get_retriever``.

    Returns:
        Whatever ``MultiQueryRetriever.generate_queries`` returns for
        ``query``.

    Raises:
        ValueError: If ``vector_store`` is not supplied.
    """
    # The retriever inputs used to be read as free variables that this
    # function never defined, so a call could only work if the module
    # happened to define globals with those names -- and otherwise failed
    # with NameError only after the model had already been loaded. They are
    # explicit parameters now and are checked before any expensive work.
    if vector_store is None:
        raise ValueError(
            "optimize_query needs a vector_store to build its retriever; "
            "pass vector_store (and search_type/search_kwargs) explicitly"
        )

    llm = HuggingFacePipeline.from_model_id(
        model_id=llm_model,
        task="text2text-generation",
        model_kwargs={"temperature": 0, "max_length": 64},
    )
    retriever = get_retriever(vector_store, search_type, search_kwargs)
    multi_query_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)

    # NOTE(review): some LangChain releases declare
    # generate_queries(question, run_manager) -- confirm the installed
    # version accepts a single argument.
    return multi_query_retriever.generate_queries(query)
168
+
169
+
170
  def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
171
  tokenized_texts = [text.split() for text in texts]
172
 
 
418
  return reranked_results
419
 
420
  # Main Comparison Function
421
+ def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
422
  all_results = []
423
  all_stats = []
424
  settings = {
 
450
  overlap_size,
451
  custom_separators.split(',') if custom_separators else None,
452
  lang,
453
+ apply_preprocessing,
454
  custom_tokenizer_file,
455
  custom_tokenizer_model,
456
  int(custom_tokenizer_vocab_size),
 
462
  chunks = optimized_chunks
463
 
464
  if use_query_optimization:
465
+ optimized_queries = optimize_query(query, query_optimization_model)
 
 
 
 
 
466
  query = " ".join(optimized_queries)
467
 
468
  results, search_time, vector_store, results_raw = search_embeddings(
 
473
  query,
474
  top_k,
475
  lang,
476
+ apply_phonetic,
477
  phonetic_weight
478
  )
479
+
480
  if use_reranking:
481
  reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
482
  results_raw = rerank_results(results_raw, query, reranker)
 
522
  with gr.Tab("Simple"):
523
  file_input = gr.File(label="Upload File (Optional)")
524
  query_input = gr.Textbox(label="Search Query")
525
+ embedding_models_input = gr.CheckboxGroup(
526
+ choices=[
527
+ "HuggingFace:paraphrase-miniLM",
528
+ "HuggingFace:paraphrase-mpnet",
529
+ "OpenAI:text-embedding-ada-002",
530
+ "Cohere:embed-multilingual-v2.0"
531
+ ],
532
+ label="Embedding Models"
533
+ )
534
  top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
535
 
536
  with gr.Tab("Advanced"):
 
544
  lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
545
 
546
  with gr.Tab("Optional"):
547
+ apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=True)
548
  optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
549
+ apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=True)
550
  phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
551
  custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
552
  custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
553
  custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
554
  custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
555
  use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
556
+ query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
557
  use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
558
 
559
  results_output = gr.Dataframe(label="Results", interactive=False)
 
567
  file_input, query_input, embedding_models_input, custom_embedding_model_input,
568
  split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
569
  vector_store_type_input, search_type_input, top_k_input, lang_input,
570
+ apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
571
+ phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
572
+ custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
573
+ use_query_optimization_input, query_optimization_model_input, use_reranking_input
574
  ],
575
  outputs=[results_output, stats_output, plot_output]
576
  )
577
 
578
+
579
  tutorial_md = """
580
  # Advanced Embedding Comparison Tool Tutorial
581