Update app.py
app.py CHANGED
@@ -171,18 +171,26 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
 
 
 # Custom Tokenizer
-def create_custom_tokenizer(file_path):
+def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
     with open(file_path, 'r', encoding='utf-8') as f:
         text = f.read()
 
-
+    if model_type == 'WordLevel':
+        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
+    elif model_type == 'BPE':
+        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+    elif model_type == 'Unigram':
+        tokenizer = Tokenizer(models.Unigram())
+    else:
+        raise ValueError(f"Unsupported tokenizer model: {model_type}")
+
     tokenizer.pre_tokenizer = Whitespace()
 
-
+    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+    trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
     tokenizer.train_from_iterator([text], trainer)
 
     return tokenizer
-
 def custom_tokenize(text, tokenizer):
     return tokenizer.encode(text).tokens
 
@@ -243,7 +251,7 @@ def get_retriever(vector_store, search_type, search_kwargs):
         raise ValueError(f"Unsupported search type: {search_type}")
 
 # Main Processing Functions
-def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None):
+def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     if file_path:
         text = FileHandler.extract_text(file_path)
     else:
@@ -253,7 +261,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
             text += FileHandler.extract_text(file_path)
 
     if custom_tokenizer_file:
-        tokenizer = create_custom_tokenizer(custom_tokenizer_file)
+        tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
         text = ' '.join(custom_tokenize(text, tokenizer))
     else:
         text = preprocess_text(text, lang)
@@ -387,7 +395,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     return tokenizer, optimized_texts
 
 # Main Comparison Function
-def compare_embeddings(file, query,
+def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
     all_results = []
     all_stats = []
     settings = {
@@ -399,12 +407,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
         "search_type": search_type,
         "top_k": top_k,
         "lang": lang,
-        "use_custom_embedding": use_custom_embedding,
         "optimize_vocab": optimize_vocab,
         "phonetic_weight": phonetic_weight
     }
 
-
+    # Parse embedding models
+    models = [model.strip().split(':') for model in embedding_models.split(',')]
+    if custom_embedding_model:
+        models.append(custom_embedding_model.strip().split(':'))
+
+    for model_type, model_name in models:
         # Process the file and generate chunks & embeddings
         chunks, embedding_model, num_tokens = process_files(
             file.name if file else None,
@@ -415,13 +427,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
             overlap_size,
             custom_separators.split(',') if custom_separators else None,
             lang,
-            custom_tokenizer_file
+            custom_tokenizer_file,
+            custom_tokenizer_model,
+            int(custom_tokenizer_vocab_size),
+            custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
         )
 
         # Custom embedding handling
-        if use_custom_embedding:
-            custom_model = create_custom_embedding(chunks)
-            embedding_model = CustomEmbeddings(custom_model)
+        #if use_custom_embedding:
+        # custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
+        # embedding_model = CustomEmbeddings(custom_model)
 
         # Optimizing vocabulary if required
         if optimize_vocab:
@@ -490,8 +505,8 @@ def launch_interface(share=True):
         inputs=[
             gr.File(label="Upload File (Optional)"),
             gr.Textbox(label="Search Query"),
-            gr.
-            gr.
+            gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)"),
+            gr.Textbox(label="Custom Embedding Model (optional, format: type:name)"),
             gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
             gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
             gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
@@ -500,10 +515,12 @@ def launch_interface(share=True):
             gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
             gr.Slider(1, 10, step=1, value=5, label="Top K"),
             gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
-            gr.Checkbox(label="Use Custom Embedding", value=False),
             gr.Checkbox(label="Optimize Vocabulary", value=False),
             gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
-            gr.File(label="Custom Tokenizer File (Optional)")
+            gr.File(label="Custom Tokenizer File (Optional)"),
+            gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)"),
+            gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000"),
+            gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
         ],
         outputs=[
             gr.Dataframe(label="Results", interactive=False),
@@ -523,13 +540,14 @@ def launch_interface(share=True):
 
     1. Upload a file (optional) or use the default files in the system.
    2. Enter a search query.
-    3.
-    4.
-    5.
-    6.
-    7.
-    8.
-    9.
+    3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
+    4. Optionally, specify a custom embedding model in the format type:name.
+    5. Choose a text splitting strategy and set chunk size and overlap.
+    6. Select a vector store type and search type.
+    7. Set the number of top results to retrieve.
+    8. Choose the language of your documents.
+    9. Optionally, optimize vocabulary or adjust phonetic matching weight.
+    10. If you have a custom tokenizer, upload the file and specify its attributes.
 
     The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
     """
@@ -539,7 +557,4 @@ def launch_interface(share=True):
         ["Embedding Comparison", "Tutorial"]
     )
 
-    iface.launch(share=share)
-
-if __name__ == "__main__":
-    launch_interface()
+    iface.launch(share=share)
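
For reference, a minimal standalone sketch of the tokenizer-training pattern the new create_custom_tokenizer follows, using the Hugging Face tokenizers package. The corpus path and the per-model trainer pairing are illustrative assumptions, not the app's code: the diff itself builds a WordLevelTrainer for every branch, which strictly matches only the WordLevel model.

# Sketch only: mirrors the pattern in create_custom_tokenizer.
# "corpus.txt" is a hypothetical file; replace with your own training text.
from tokenizers import Tokenizer, models, trainers
from tokenizers.pre_tokenizers import Whitespace

def build_tokenizer_sketch(file_path, model_type="WordLevel", vocab_size=10000):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Assumption: pair each model with its matching trainer, rather than
    # reusing WordLevelTrainer for every model type as the diff does.
    if model_type == "WordLevel":
        tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
        trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == "BPE":
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == "Unigram":
        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(special_tokens=special_tokens, vocab_size=vocab_size, unk_token="[UNK]")
    else:
        raise ValueError(f"Unsupported tokenizer model: {model_type}")

    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([text], trainer)
    return tokenizer

if __name__ == "__main__":
    tok = build_tokenizer_sketch("corpus.txt", model_type="BPE")  # hypothetical corpus file
    print(tok.encode("Ein kleiner Beispielsatz.").tokens)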
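
The new "Embedding Models" textbox expects a comma-separated list of type:name pairs. Below is a small sketch of how that string is split into (model_type, model_name) pairs, mirroring the list comprehension added in compare_embeddings; the helper name is hypothetical, and the example models come from the UI label in the diff.

# Sketch only: mirrors the parsing added in compare_embeddings.
def parse_embedding_models(embedding_models, custom_embedding_model=""):
    """Turn 'Type:name,Type:name' into [[type, name], ...]."""
    models = [m.strip().split(':') for m in embedding_models.split(',')]
    if custom_embedding_model:
        models.append(custom_embedding_model.strip().split(':'))
    return models

print(parse_embedding_models("HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002"))
# [['HuggingFace', 'paraphrase-miniLM'], ['OpenAI', 'text-embedding-ada-002']]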