Chris4K committed on
Commit
c77f8ac
·
verified ·
1 Parent(s): ce988dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -28
app.py CHANGED
@@ -171,18 +171,26 @@ class CustomEmbeddings(HuggingFaceEmbeddings):
171
 
172
 
173
  # Custom Tokenizer
174
- def create_custom_tokenizer(file_path):
175
  with open(file_path, 'r', encoding='utf-8') as f:
176
  text = f.read()
177
 
178
- tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
 
 
 
 
 
 
 
 
179
  tokenizer.pre_tokenizer = Whitespace()
180
 
181
- trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
 
182
  tokenizer.train_from_iterator([text], trainer)
183
 
184
  return tokenizer
185
-
186
  def custom_tokenize(text, tokenizer):
187
  return tokenizer.encode(text).tokens
188
 
@@ -243,7 +251,7 @@ def get_retriever(vector_store, search_type, search_kwargs):
243
  raise ValueError(f"Unsupported search type: {search_type}")
244
 
245
  # Main Processing Functions
246
- def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None):
247
  if file_path:
248
  text = FileHandler.extract_text(file_path)
249
  else:
@@ -253,7 +261,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
253
  text += FileHandler.extract_text(file_path)
254
 
255
  if custom_tokenizer_file:
256
- tokenizer = create_custom_tokenizer(custom_tokenizer_file)
257
  text = ' '.join(custom_tokenize(text, tokenizer))
258
  else:
259
  text = preprocess_text(text, lang)
@@ -387,7 +395,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
387
  return tokenizer, optimized_texts
388
 
389
  # Main Comparison Function
390
- def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
391
  all_results = []
392
  all_stats = []
393
  settings = {
@@ -399,12 +407,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
399
  "search_type": search_type,
400
  "top_k": top_k,
401
  "lang": lang,
402
- "use_custom_embedding": use_custom_embedding,
403
  "optimize_vocab": optimize_vocab,
404
  "phonetic_weight": phonetic_weight
405
  }
406
 
407
- for model_type, model_name in zip(model_types, model_names):
 
 
 
 
 
408
  # Process the file and generate chunks & embeddings
409
  chunks, embedding_model, num_tokens = process_files(
410
  file.name if file else None,
@@ -415,13 +427,16 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
415
  overlap_size,
416
  custom_separators.split(',') if custom_separators else None,
417
  lang,
418
- custom_tokenizer_file
 
 
 
419
  )
420
 
421
  # Custom embedding handling
422
- if use_custom_embedding:
423
- custom_model = create_custom_embedding(chunks) #add custom model by name, must com from gradio FE
424
- embedding_model = CustomEmbeddings(custom_model)
425
 
426
  # Optimizing vocabulary if required
427
  if optimize_vocab:
@@ -490,8 +505,8 @@ def launch_interface(share=True):
490
  inputs=[
491
  gr.File(label="Upload File (Optional)"),
492
  gr.Textbox(label="Search Query"),
493
- gr.CheckboxGroup(choices=list(model_manager.list_models().keys()) + ["Custom"], label="Embedding Model Types"),
494
- gr.CheckboxGroup(choices=[model for models in model_manager.list_models().values() for model in models] + ["custom_model"], label="Embedding Models"),
495
  gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
496
  gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
497
  gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
@@ -500,10 +515,12 @@ def launch_interface(share=True):
500
  gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
501
  gr.Slider(1, 10, step=1, value=5, label="Top K"),
502
  gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
503
- gr.Checkbox(label="Use Custom Embedding", value=False),
504
  gr.Checkbox(label="Optimize Vocabulary", value=False),
505
  gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
506
- gr.File(label="Custom Tokenizer File (Optional)")
 
 
 
507
  ],
508
  outputs=[
509
  gr.Dataframe(label="Results", interactive=False),
@@ -523,13 +540,14 @@ def launch_interface(share=True):
523
 
524
  1. Upload a file (optional) or use the default files in the system.
525
  2. Enter a search query.
526
- 3. Select one or more embedding model types and specific models.
527
- 4. Choose a text splitting strategy and set chunk size and overlap.
528
- 5. Select a vector store type and search type.
529
- 6. Set the number of top results to retrieve.
530
- 7. Choose the language of your documents.
531
- 8. Optionally, use custom embeddings, optimize vocabulary, or adjust phonetic matching weight.
532
- 9. If you have a custom tokenizer, upload the file.
 
533
 
534
  The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
535
  """
@@ -539,7 +557,4 @@ def launch_interface(share=True):
539
  ["Embedding Comparison", "Tutorial"]
540
  )
541
 
542
- iface.launch(share=share)
543
-
544
- if __name__ == "__main__":
545
- launch_interface()
 
171
 
172
 
173
  # Custom Tokenizer
174
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
    """Train a tokenizer of the requested model family on a text file.

    Args:
        file_path: Path to a UTF-8 text file used as the training corpus.
        model_type: One of 'WordLevel', 'BPE' or 'Unigram'. ``None`` or an
            empty string falls back to 'WordLevel', because callers forward
            an optional UI field that may be left blank.
        vocab_size: Maximum vocabulary size handed to the trainer.
        special_tokens: Special tokens to reserve; defaults to
            ``["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]``.

    Returns:
        The trained ``Tokenizer`` with a whitespace pre-tokenizer.

    Raises:
        ValueError: If ``model_type`` is not a supported model name.
    """
    # Validate the arguments before touching the file system.
    model_type = model_type or 'WordLevel'
    if model_type not in ('WordLevel', 'BPE', 'Unigram'):
        raise ValueError(f"Unsupported tokenizer model: {model_type}")

    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Every model family must be trained with its own trainer class; a
    # WordLevelTrainer cannot train a BPE or Unigram model.
    if model_type == 'WordLevel':
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    elif model_type == 'BPE':
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    else:  # 'Unigram'
        tokenizer = Tokenizer(models.Unigram())
        # Unigram takes its unknown token from the trainer, not the model.
        trainer = trainers.UnigramTrainer(
            special_tokens=special_tokens,
            vocab_size=vocab_size,
            unk_token="[UNK]",
        )

    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([text], trainer)

    return tokenizer


def custom_tokenize(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the resulting token strings."""
    return tokenizer.encode(text).tokens
196
 
 
251
  raise ValueError(f"Unsupported search type: {search_type}")
252
 
253
  # Main Processing Functions
254
+ def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators, lang='german', custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
255
  if file_path:
256
  text = FileHandler.extract_text(file_path)
257
  else:
 
261
  text += FileHandler.extract_text(file_path)
262
 
263
  if custom_tokenizer_file:
264
+ tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
265
  text = ' '.join(custom_tokenize(text, tokenizer))
266
  else:
267
  text = preprocess_text(text, lang)
 
395
  return tokenizer, optimized_texts
396
 
397
  # Main Comparison Function
398
+ def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None):
399
  all_results = []
400
  all_stats = []
401
  settings = {
 
407
  "search_type": search_type,
408
  "top_k": top_k,
409
  "lang": lang,
 
410
  "optimize_vocab": optimize_vocab,
411
  "phonetic_weight": phonetic_weight
412
  }
413
 
414
+ # Parse embedding models
415
+ models = [model.strip().split(':') for model in embedding_models.split(',')]
416
+ if custom_embedding_model:
417
+ models.append(custom_embedding_model.strip().split(':'))
418
+
419
+ for model_type, model_name in models:
420
  # Process the file and generate chunks & embeddings
421
  chunks, embedding_model, num_tokens = process_files(
422
  file.name if file else None,
 
427
  overlap_size,
428
  custom_separators.split(',') if custom_separators else None,
429
  lang,
430
+ custom_tokenizer_file,
431
+ custom_tokenizer_model,
432
+ int(custom_tokenizer_vocab_size),
433
+ custom_tokenizer_special_tokens.split(',') if custom_tokenizer_special_tokens else None
434
  )
435
 
436
  # Custom embedding handling
437
+ #if use_custom_embedding:
438
+ # custom_model = create_custom_embedding(chunks) # add custom model by name, must come from gradio FE
439
+ # embedding_model = CustomEmbeddings(custom_model)
440
 
441
  # Optimizing vocabulary if required
442
  if optimize_vocab:
 
505
  inputs=[
506
  gr.File(label="Upload File (Optional)"),
507
  gr.Textbox(label="Search Query"),
508
+ gr.Textbox(label="Embedding Models (comma-separated, e.g. HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002)"),
509
+ gr.Textbox(label="Custom Embedding Model (optional, format: type:name)"),
510
  gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
511
  gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
512
  gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
 
515
  gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity"),
516
  gr.Slider(1, 10, step=1, value=5, label="Top K"),
517
  gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german"),
 
518
  gr.Checkbox(label="Optimize Vocabulary", value=False),
519
  gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight"),
520
+ gr.File(label="Custom Tokenizer File (Optional)"),
521
+ gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)"),
522
+ gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000"),
523
+ gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
524
  ],
525
  outputs=[
526
  gr.Dataframe(label="Results", interactive=False),
 
540
 
541
  1. Upload a file (optional) or use the default files in the system.
542
  2. Enter a search query.
543
+ 3. Enter embedding models as a comma-separated list (e.g., HuggingFace:paraphrase-miniLM,OpenAI:text-embedding-ada-002).
544
+ 4. Optionally, specify a custom embedding model in the format type:name.
545
+ 5. Choose a text splitting strategy and set chunk size and overlap.
546
+ 6. Select a vector store type and search type.
547
+ 7. Set the number of top results to retrieve.
548
+ 8. Choose the language of your documents.
549
+ 9. Optionally, optimize vocabulary or adjust phonetic matching weight.
550
+ 10. If you have a custom tokenizer, upload the file and specify its attributes.
551
 
552
  The tool will process your query and display results, statistics, and visualizations to help you compare the performance of different models and strategies.
553
  """
 
557
  ["Embedding Comparison", "Tutorial"]
558
  )
559
 
560
+ iface.launch(share=share)