Update app.py
Browse files
app.py
CHANGED
@@ -116,7 +116,10 @@ class FileHandler:
|
|
116 |
def simple_tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    Uses ``str.split()`` with no separator, so leading and trailing
    whitespace is ignored and consecutive whitespace characters count
    as a single delimiter. An empty or all-whitespace string yields ``[]``.
    """
    tokens = text.split()
    return tokens
|
118 |
|
119 |
-
def preprocess_text(text, lang='german'):
|
|
|
|
|
|
|
120 |
text = text.lower()
|
121 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
122 |
|
@@ -141,13 +144,29 @@ def preprocess_text(text, lang='german'):
|
|
141 |
|
142 |
return ' '.join(tokens)
|
143 |
|
144 |
-
def phonetic_match(text, query, method='levenshtein_distance'):
    """Return a phonetic distance between *text* and *query*.

    For ``method='levenshtein_distance'`` both strings are converted to
    their Soundex codes and the Levenshtein distance between the two
    codes is returned. Any other *method* value yields ``0``.
    """
    # Guard clause: only one method is implemented; everything else is 0.
    if method != 'levenshtein_distance':
        return 0

    encoded_text = jellyfish.soundex(text)
    encoded_query = jellyfish.soundex(query)
    return jellyfish.levenshtein_distance(encoded_text, encoded_query)
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
152 |
tokenized_texts = [text.split() for text in texts]
|
153 |
|
@@ -399,7 +418,7 @@ def rerank_results(results, query, reranker):
|
|
399 |
return reranked_results
|
400 |
|
401 |
# Main Comparison Function
|
402 |
-
def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
|
403 |
all_results = []
|
404 |
all_stats = []
|
405 |
settings = {
|
@@ -431,6 +450,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
431 |
overlap_size,
|
432 |
custom_separators.split(',') if custom_separators else None,
|
433 |
lang,
|
|
|
434 |
custom_tokenizer_file,
|
435 |
custom_tokenizer_model,
|
436 |
int(custom_tokenizer_vocab_size),
|
@@ -442,12 +462,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
442 |
chunks = optimized_chunks
|
443 |
|
444 |
if use_query_optimization:
|
445 |
-
|
446 |
-
model_id="google/flan-t5-base",
|
447 |
-
task="text2text-generation",
|
448 |
-
model_kwargs={"temperature": 0, "max_length": 64},
|
449 |
-
)
|
450 |
-
optimized_queries = optimize_query(query, llm)
|
451 |
query = " ".join(optimized_queries)
|
452 |
|
453 |
results, search_time, vector_store, results_raw = search_embeddings(
|
@@ -458,9 +473,10 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
458 |
query,
|
459 |
top_k,
|
460 |
lang,
|
|
|
461 |
phonetic_weight
|
462 |
)
|
463 |
-
|
464 |
if use_reranking:
|
465 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
466 |
results_raw = rerank_results(results_raw, query, reranker)
|
@@ -506,7 +522,15 @@ def launch_interface(share=True):
|
|
506 |
with gr.Tab("Simple"):
|
507 |
file_input = gr.File(label="Upload File (Optional)")
|
508 |
query_input = gr.Textbox(label="Search Query")
|
509 |
-
embedding_models_input = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
510 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
511 |
|
512 |
with gr.Tab("Advanced"):
|
@@ -520,13 +544,16 @@ def launch_interface(share=True):
|
|
520 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
521 |
|
522 |
with gr.Tab("Optional"):
|
|
|
523 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
|
|
524 |
phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
|
525 |
custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
|
526 |
custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
|
527 |
custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
|
528 |
custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
|
529 |
use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
|
|
|
530 |
use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
|
531 |
|
532 |
results_output = gr.Dataframe(label="Results", interactive=False)
|
@@ -540,13 +567,15 @@ def launch_interface(share=True):
|
|
540 |
file_input, query_input, embedding_models_input, custom_embedding_model_input,
|
541 |
split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
|
542 |
vector_store_type_input, search_type_input, top_k_input, lang_input,
|
543 |
-
|
544 |
-
|
545 |
-
|
|
|
546 |
],
|
547 |
outputs=[results_output, stats_output, plot_output]
|
548 |
)
|
549 |
|
|
|
550 |
tutorial_md = """
|
551 |
# Advanced Embedding Comparison Tool Tutorial
|
552 |
|
|
|
116 |
def simple_tokenize(text):
    """Split *text* into tokens on runs of whitespace.

    Uses ``str.split()`` with no separator, so leading and trailing
    whitespace is ignored and consecutive whitespace characters count
    as a single delimiter. An empty or all-whitespace string yields ``[]``.
    """
    tokens = text.split()
    return tokens
|
118 |
|
119 |
+
def preprocess_text(text, lang='german', apply_preprocessing=True):
|
120 |
+
if not apply_preprocessing:
|
121 |
+
return text
|
122 |
+
|
123 |
text = text.lower()
|
124 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
125 |
|
|
|
144 |
|
145 |
return ' '.join(tokens)
|
146 |
|
147 |
+
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True):
    """Return a phonetic distance between *text* and *query*.

    When *apply_phonetic* is false the comparison is skipped and ``0`` is
    returned. Otherwise, for ``method='levenshtein_distance'``, both
    strings are converted to their Soundex codes and the Levenshtein
    distance between the two codes is returned. Any other *method* value
    yields ``0``.
    """
    # Guard clause: phonetic matching disabled, or a method that is not
    # implemented here -- both cases report a distance of 0.
    if not apply_phonetic or method != 'levenshtein_distance':
        return 0

    encoded_text = jellyfish.soundex(text)
    encoded_query = jellyfish.soundex(query)
    return jellyfish.levenshtein_distance(encoded_text, encoded_query)
|
155 |
|
156 |
+
def optimize_query(query, llm_model):
    """Generate alternative phrasings of *query* with a Hugging Face model.

    Loads *llm_model* as a ``text2text-generation`` pipeline, wraps it in a
    ``MultiQueryRetriever`` and returns the result of its
    ``generate_queries(query)`` call.

    Args:
        query: The user's search query.
        llm_model: Hugging Face model id passed to
            ``HuggingFacePipeline.from_model_id``.

    Returns:
        Whatever ``MultiQueryRetriever.generate_queries`` returns
        (presumably a list of query strings -- verify against caller).
    """
    generation_settings = {"temperature": 0, "max_length": 64}
    llm = HuggingFacePipeline.from_model_id(
        model_id=llm_model,
        task="text2text-generation",
        model_kwargs=generation_settings,
    )

    # NOTE(review): vector_store, search_type and search_kwargs are neither
    # parameters nor locals of this function. Unless they are defined at
    # module level, this line raises NameError -- TODO confirm and, if so,
    # pass them in explicitly.
    base_retriever = get_retriever(vector_store, search_type, search_kwargs)

    query_expander = MultiQueryRetriever.from_llm(retriever=base_retriever, llm=llm)
    return query_expander.generate_queries(query)
|
168 |
+
|
169 |
+
|
170 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
171 |
tokenized_texts = [text.split() for text in texts]
|
172 |
|
|
|
418 |
return reranked_results
|
419 |
|
420 |
# Main Comparison Function
|
421 |
+
def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
|
422 |
all_results = []
|
423 |
all_stats = []
|
424 |
settings = {
|
|
|
450 |
overlap_size,
|
451 |
custom_separators.split(',') if custom_separators else None,
|
452 |
lang,
|
453 |
+
apply_preprocessing,
|
454 |
custom_tokenizer_file,
|
455 |
custom_tokenizer_model,
|
456 |
int(custom_tokenizer_vocab_size),
|
|
|
462 |
chunks = optimized_chunks
|
463 |
|
464 |
if use_query_optimization:
|
465 |
+
optimized_queries = optimize_query(query, query_optimization_model)
|
|
|
|
|
|
|
|
|
|
|
466 |
query = " ".join(optimized_queries)
|
467 |
|
468 |
results, search_time, vector_store, results_raw = search_embeddings(
|
|
|
473 |
query,
|
474 |
top_k,
|
475 |
lang,
|
476 |
+
apply_phonetic,
|
477 |
phonetic_weight
|
478 |
)
|
479 |
+
|
480 |
if use_reranking:
|
481 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
482 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
522 |
with gr.Tab("Simple"):
|
523 |
file_input = gr.File(label="Upload File (Optional)")
|
524 |
query_input = gr.Textbox(label="Search Query")
|
525 |
+
embedding_models_input = gr.CheckboxGroup(
|
526 |
+
choices=[
|
527 |
+
"HuggingFace:paraphrase-miniLM",
|
528 |
+
"HuggingFace:paraphrase-mpnet",
|
529 |
+
"OpenAI:text-embedding-ada-002",
|
530 |
+
"Cohere:embed-multilingual-v2.0"
|
531 |
+
],
|
532 |
+
label="Embedding Models"
|
533 |
+
)
|
534 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
535 |
|
536 |
with gr.Tab("Advanced"):
|
|
|
544 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
545 |
|
546 |
with gr.Tab("Optional"):
|
547 |
+
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=True)
|
548 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
549 |
+
apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=True)
|
550 |
phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
|
551 |
custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
|
552 |
custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
|
553 |
custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
|
554 |
custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
|
555 |
use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
|
556 |
+
query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
|
557 |
use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
|
558 |
|
559 |
results_output = gr.Dataframe(label="Results", interactive=False)
|
|
|
567 |
file_input, query_input, embedding_models_input, custom_embedding_model_input,
|
568 |
split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
|
569 |
vector_store_type_input, search_type_input, top_k_input, lang_input,
|
570 |
+
apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
|
571 |
+
phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
|
572 |
+
custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
|
573 |
+
use_query_optimization_input, query_optimization_model_input, use_reranking_input
|
574 |
],
|
575 |
outputs=[results_output, stats_output, plot_output]
|
576 |
)
|
577 |
|
578 |
+
|
579 |
tutorial_md = """
|
580 |
# Advanced Embedding Comparison Tool Tutorial
|
581 |
|