Update app.py
app.py
CHANGED
@@ -41,8 +41,8 @@ from huggingface_hub import login
 41   from typing import List, Tuple, Optional
 42
 43
 44 - hf_token = os.getenv("hf_token")
 45 - login(token=hf_token)
 46
 47   # Define the model pipeline with additional generation parameters
 48   #model_pipeline = pipeline(
@@ -154,28 +154,28 @@ class ModelManager:
|
|
154 |
}
|
155 |
}
|
156 |
|
157 |
-
|
158 |
def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
|
159 |
"""Update model ranking based on performance and optional feedback"""
|
160 |
current_score = self.rankings.get(model_id, 0.0)
|
161 |
# Weighted average of current score and new score
|
162 |
self.rankings[model_id] = 0.7 * current_score + 0.3 * score
|
163 |
-
|
164 |
if feedback:
|
165 |
if model_id not in self.model_stats:
|
166 |
self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
|
167 |
self.model_stats[model_id]["feedback_count"] += 1
|
168 |
self.model_stats[model_id]["feedback"].append(feedback)
|
169 |
-
|
170 |
def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
|
171 |
"""Get top n ranked models"""
|
172 |
return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
|
173 |
-
|
174 |
def get_model_stats(self, model_id: str) -> Dict[str, Any]:
|
175 |
"""Get statistics for a specific model"""
|
176 |
return self.model_stats.get(model_id, {})
|
177 |
|
178 |
-
|
179 |
def add_model(self, provider, name, model_path):
|
180 |
if provider not in self.models:
|
181 |
self.models[provider] = {}
|
@@ -286,29 +286,29 @@ def simple_tokenize(text):
|
|
286 |
def preprocess_text(text, lang='german', apply_preprocessing=False):
|
287 |
if not apply_preprocessing:
|
288 |
return text
|
289 |
-
|
290 |
text = text.lower()
|
291 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
292 |
-
|
293 |
try:
|
294 |
tokens = word_tokenize(text, language=lang)
|
295 |
except LookupError:
|
296 |
print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
|
297 |
tokens = simple_tokenize(text)
|
298 |
-
|
299 |
try:
|
300 |
stop_words = set(stopwords.words(lang))
|
301 |
except LookupError:
|
302 |
print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
|
303 |
stop_words = set()
|
304 |
tokens = [token for token in tokens if token not in stop_words]
|
305 |
-
|
306 |
try:
|
307 |
stemmer = SnowballStemmer(lang)
|
308 |
tokens = [stemmer.stem(token) for token in tokens]
|
309 |
except ValueError:
|
310 |
print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
|
311 |
-
|
312 |
return ' '.join(tokens)
|
313 |
|
314 |
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
|
@@ -341,7 +341,7 @@ def optimize_query(
|
|
341 |
) -> str:
|
342 |
"""
|
343 |
CPU-optimized version of query expansion using a small language model.
|
344 |
-
|
345 |
Args:
|
346 |
query: Original search query
|
347 |
query_optimization_model: Name or path of the model to use for optimization
|
@@ -351,17 +351,17 @@ def optimize_query(
|
|
351 |
search_type: Type of search being performed
|
352 |
top_k: Number of expansion terms to add
|
353 |
use_gpu: Whether to use GPU if available (defaults to False for CPU)
|
354 |
-
|
355 |
Returns:
|
356 |
Expanded query string
|
357 |
"""
|
358 |
try:
|
359 |
# Set device
|
360 |
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
361 |
-
|
362 |
# 1. Basic text preprocessing (CPU-based)
|
363 |
tokens = word_tokenize(query.lower())
|
364 |
-
|
365 |
# 2. WordNet synonyms expansion (CPU-based)
|
366 |
expanded_terms = set()
|
367 |
for token in tokens:
|
@@ -370,7 +370,7 @@ def optimize_query(
|
|
370 |
for syn in synsets:
|
371 |
# Limit number of lemmas
|
372 |
expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
|
373 |
-
|
374 |
# 3. Use provided model with reduced complexity
|
375 |
try:
|
376 |
# Load model with reduced memory footprint
|
@@ -384,11 +384,11 @@ def optimize_query(
|
|
384 |
low_cpu_mem_usage=True,
|
385 |
device_map="cpu"
|
386 |
)
|
387 |
-
|
388 |
# Move model to CPU and eval mode
|
389 |
model = model.to(device)
|
390 |
model.eval()
|
391 |
-
|
392 |
# Prepare input with reduced length
|
393 |
prompt = f"Enhance this search query with relevant terms: {query}"
|
394 |
inputs = tokenizer(
|
@@ -398,7 +398,7 @@ def optimize_query(
|
|
398 |
truncation=True,
|
399 |
padding=True
|
400 |
)
|
401 |
-
|
402 |
# Generate with minimal parameters
|
403 |
with torch.no_grad():
|
404 |
outputs = model.generate(
|
@@ -409,41 +409,41 @@ def optimize_query(
|
|
409 |
do_sample=False,
|
410 |
early_stopping=True
|
411 |
)
|
412 |
-
|
413 |
enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
414 |
-
|
415 |
# Clear CUDA cache if GPU was used
|
416 |
if device == "cuda":
|
417 |
torch.cuda.empty_cache()
|
418 |
-
|
419 |
except Exception as model_error:
|
420 |
print(f"Model-based expansion failed: {str(model_error)}")
|
421 |
enhanced_query = query
|
422 |
-
|
423 |
# 4. Combine original and expanded terms
|
424 |
final_terms = set(tokens)
|
425 |
final_terms.update(expanded_terms)
|
426 |
if enhanced_query != query:
|
427 |
final_terms.update(word_tokenize(enhanced_query.lower()))
|
428 |
-
|
429 |
# 5. Remove stopwords and select top_k most relevant terms
|
430 |
stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
|
431 |
final_terms = [term for term in final_terms if term not in stopwords]
|
432 |
-
|
433 |
# Combine with original query
|
434 |
expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
|
435 |
-
|
436 |
# Clean up
|
437 |
del model
|
438 |
del tokenizer
|
439 |
if device == "cuda":
|
440 |
torch.cuda.empty_cache()
|
441 |
-
|
442 |
-
return [Document(page_content=expanded_query.strip())]
|
443 |
-
|
444 |
except Exception as e:
|
445 |
print(f"Query optimization failed: {str(e)}")
|
446 |
-
return [Document(page_content=query)] # Return original query if optimization fails
|
447 |
|
448 |
|
449 |
|
@@ -458,27 +458,27 @@ optimized_query = optimize_query(
|
|
458 |
use_gpu=False # Explicitly use CPU
|
459 |
)
|
460 |
"""
|
461 |
-
|
462 |
|
463 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
464 |
tokenized_texts = [text.split() for text in texts]
|
465 |
-
|
466 |
if model_type == 'word2vec':
|
467 |
model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
468 |
elif model_type == 'fasttext':
|
469 |
model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
470 |
else:
|
471 |
raise ValueError("Unsupported model type")
|
472 |
-
|
473 |
return model
|
474 |
|
475 |
class CustomEmbeddings(HuggingFaceEmbeddings):
|
476 |
def __init__(self, model_path):
|
477 |
self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
|
478 |
-
|
479 |
def embed_documents(self, texts):
|
480 |
return [self.model.wv[text.split()] for text in texts]
|
481 |
-
|
482 |
def embed_query(self, text):
|
483 |
return self.model.wv[text.split()]
|
484 |
|
@@ -520,7 +520,7 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
|
|
520 |
chunk_size=chunk_size,
|
521 |
chunk_overlap=overlap_size,
|
522 |
add_start_index=True, # If `True`, includes chunk's start index in metadata
|
523 |
-
strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
|
524 |
separators=custom_separators or ["\n\n", "\n", " ", ""]
|
525 |
)
|
526 |
else:
|
@@ -534,7 +534,7 @@ def get_embedding_model(model_type, model_name):
|
|
534 |
multi_process=True,
|
535 |
# model_kwargs={"device": "cpu"},
|
536 |
#encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
|
537 |
-
)
|
538 |
elif model_type == 'OpenAI':
|
539 |
return OpenAIEmbeddings(model=model_path)
|
540 |
elif model_type == 'Cohere':
|
@@ -566,10 +566,10 @@ def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_
|
|
566 |
phonetic_sim = phonetic_match(doc_text, query)
|
567 |
combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
|
568 |
return combined_sim
|
569 |
-
|
570 |
def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
|
571 |
chunks = list(chunks_tuple)
|
572 |
-
|
573 |
if vector_store_type == 'FAISS':
|
574 |
return FAISS.from_texts(chunks, embedding_model)
|
575 |
elif vector_store_type == 'Chroma':
|
@@ -587,7 +587,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
587 |
for file in os.listdir(FILES_DIR):
|
588 |
file_path = os.path.join(FILES_DIR, file)
|
589 |
text += FileHandler.extract_text(file_path)
|
590 |
-
|
591 |
if custom_tokenizer_file:
|
592 |
tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
|
593 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
@@ -603,7 +603,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
603 |
|
604 |
def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
|
605 |
preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
|
606 |
-
|
607 |
vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
|
608 |
retriever = get_retriever(vector_store, search_type, {"k": top_k})
|
609 |
|
@@ -613,10 +613,10 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
|
|
613 |
#this should be optional
|
614 |
def score_result(doc):
|
615 |
base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
|
616 |
-
|
617 |
# Add bonus for containing expected result
|
618 |
expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
|
619 |
-
|
620 |
if apply_phonetic:
|
621 |
phonetic_score = phonetic_match(doc.page_content, query)
|
622 |
return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
|
@@ -645,7 +645,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
|
|
645 |
# Enhanced Result Analysis
|
646 |
class ResultAnalyzer:
|
647 |
@staticmethod
|
648 |
-
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
|
649 |
top_k, expected_result=None, model_feedback=None):
|
650 |
stats = {
|
651 |
"num_results": len(results),
|
@@ -657,7 +657,7 @@ class ResultAnalyzer:
|
|
657 |
"embedding_dimension": len(embedding_model.embed_query(query)),
|
658 |
"top_k": top_k,
|
659 |
}
|
660 |
-
|
661 |
# Add vector store statistics
|
662 |
try:
|
663 |
if hasattr(vector_store, '_index'):
|
@@ -666,13 +666,13 @@ class ResultAnalyzer:
|
|
666 |
stats["vector_store_size"] = len(vector_store._collection.get())
|
667 |
except:
|
668 |
stats["vector_store_size"] = "N/A"
|
669 |
-
|
670 |
# Add expected result statistics if provided
|
671 |
if expected_result:
|
672 |
stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
|
673 |
-
stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
|
674 |
if expected_result in doc.page_content), -1) + 1
|
675 |
-
|
676 |
# Calculate diversity metrics for larger result sets
|
677 |
if len(results) > 3: # Changed from 1000 to make it more practical
|
678 |
embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
@@ -681,7 +681,7 @@ class ResultAnalyzer:
|
|
681 |
else:
|
682 |
stats["result_diversity"] = "N/A"
|
683 |
stats["silhouette_score"] = "N/A"
|
684 |
-
|
685 |
# Add ranking correlation
|
686 |
query_embedding = embedding_model.embed_query(query)
|
687 |
result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
@@ -691,20 +691,20 @@ class ResultAnalyzer:
|
|
691 |
stats["rank_correlation"] = rank_correlation
|
692 |
else:
|
693 |
stats["rank_correlation"] = "N/A"
|
694 |
-
|
695 |
# Add model feedback if provided
|
696 |
if model_feedback:
|
697 |
stats["model_feedback"] = model_feedback
|
698 |
-
|
699 |
return stats
|
700 |
-
|
701 |
@staticmethod
|
702 |
def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
|
703 |
"""Calculate diversity score for embeddings"""
|
704 |
embeddings_array = np.array(embeddings)
|
705 |
pairwise_similarities = np.inner(embeddings_array, embeddings_array)
|
706 |
return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
|
707 |
-
|
708 |
@staticmethod
|
709 |
def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
|
710 |
"""Calculate silhouette score for embeddings"""
|
@@ -724,13 +724,13 @@ def visualize_results(results_df, stats_df):
|
|
724 |
# Add model column if not present
|
725 |
if 'model' not in stats_df.columns:
|
726 |
stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
|
727 |
-
|
728 |
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
|
729 |
-
|
730 |
# Handle empty dataframe case
|
731 |
if len(stats_df) == 0:
|
732 |
return fig
|
733 |
-
|
734 |
# Create plots with error handling
|
735 |
try:
|
736 |
sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
|
@@ -738,36 +738,36 @@ def visualize_results(results_df, stats_df):
|
|
738 |
axs[0, 0].tick_params(axis='x', rotation=45)
|
739 |
except Exception as e:
|
740 |
print(f"Error in search time plot: {e}")
|
741 |
-
|
742 |
try:
|
743 |
-
sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
|
744 |
hue='model', ax=axs[0, 1])
|
745 |
axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
|
746 |
except Exception as e:
|
747 |
print(f"Error in diversity plot: {e}")
|
748 |
-
|
749 |
try:
|
750 |
sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
|
751 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
752 |
axs[1, 0].tick_params(axis='x', rotation=45)
|
753 |
except Exception as e:
|
754 |
print(f"Error in content length plot: {e}")
|
755 |
-
|
756 |
try:
|
757 |
valid_embeddings = results_df['embedding'].dropna().values
|
758 |
if len(valid_embeddings) > 1:
|
759 |
tsne = TSNE(n_components=2, random_state=42)
|
760 |
embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
|
761 |
-
sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
|
762 |
-
hue=results_df['Model'][:len(valid_embeddings)],
|
763 |
ax=axs[1, 1])
|
764 |
axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
|
765 |
else:
|
766 |
-
axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
|
767 |
ha='center', va='center')
|
768 |
except Exception as e:
|
769 |
print(f"Error in embedding visualization: {e}")
|
770 |
-
|
771 |
plt.tight_layout()
|
772 |
return fig
|
773 |
|
@@ -778,56 +778,56 @@ def visualize_results(results_df, stats_df):
|
|
778 |
#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
|
779 |
#plt.show()
|
780 |
|
781 |
-
|
782 |
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
|
783 |
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
|
784 |
|
785 |
word_freq = Counter(word for text in texts for word in text.split())
|
786 |
-
|
787 |
optimized_texts = [
|
788 |
' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
|
789 |
for text in texts
|
790 |
]
|
791 |
-
|
792 |
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
793 |
tokenizer.train_from_iterator(optimized_texts, trainer)
|
794 |
-
|
795 |
return tokenizer, optimized_texts
|
796 |
-
|
797 |
import numpy as np
|
798 |
from transformers import TextClassificationPipeline
|
799 |
from typing import List, Union, Any
|
800 |
|
801 |
-
|
802 |
|
803 |
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
804 |
|
805 |
|
806 |
def rerank_results(
|
807 |
-
results: List[Any],
|
808 |
-
query: str,
|
809 |
reranker: Union[TextClassificationPipeline, Any]
|
810 |
) -> List[Any]:
|
811 |
"""
|
812 |
-
|
813 |
"""
|
814 |
if not results:
|
815 |
return results
|
816 |
-
|
817 |
# Step 1: Encode the query and documents using SentenceTransformer
|
818 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
819 |
doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
|
820 |
doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
|
821 |
-
|
822 |
# Step 2: Compute cosine similarities between query and document embeddings
|
823 |
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
|
824 |
-
|
825 |
# Step 3: Sort documents by similarity score in descending order
|
826 |
-
reranked_idx = np.argsort(cosine_scores.numpy())[::-1]
|
827 |
-
|
828 |
# Step 4: Return the reranked documents
|
829 |
reranked_results = [results[i] for i in reranked_idx]
|
830 |
-
|
831 |
return reranked_results
|
832 |
|
833 |
|
@@ -878,13 +878,13 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
878 |
if optimize_vocab:
|
879 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
880 |
chunks = optimized_chunks
|
881 |
-
|
882 |
search_query = query
|
883 |
-
|
884 |
if use_query_optimization:
|
885 |
optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
|
886 |
#query = " ".join(optimized_queries)
|
887 |
-
search_query = " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
|
888 |
|
889 |
results, search_time, vector_store, results_raw = search_embeddings(
|
890 |
chunks,
|
@@ -897,8 +897,8 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
897 |
lang,
|
898 |
apply_phonetic,
|
899 |
phonetic_weight
|
900 |
-
)
|
901 |
-
|
902 |
if use_reranking:
|
903 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
904 |
results_raw = rerank_results(results_raw, query, reranker)
|
@@ -953,7 +953,7 @@ from tqdm import tqdm
|
|
953 |
def automated_testing(file, query, test_params, expected_result=None):
|
954 |
all_results = []
|
955 |
all_stats = []
|
956 |
-
|
957 |
param_grid = ParameterGrid(test_params)
|
958 |
print(param_grid)
|
959 |
for params in tqdm(param_grid, desc="Running tests"):
|
@@ -995,7 +995,7 @@ def automated_testing(file, query, test_params, expected_result=None):
|
|
995 |
params['apply_phonetic'],
|
996 |
params['phonetic_weight']
|
997 |
)
|
998 |
-
|
999 |
if params['use_reranking']:
|
1000 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
1001 |
results_raw = rerank_results(results_raw, query, reranker)
|
@@ -1022,17 +1022,27 @@ def analyze_results(stats_df):
|
|
1022 |
'contains_expected': 0.5, # High weight for containing the expected result
|
1023 |
'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
|
1024 |
}
|
1025 |
-
|
|
|
|
|
|
|
1026 |
for metric in metric_weights.keys():
|
1027 |
-
|
1028 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1029 |
stats_df['weighted_score'] = sum(
|
1030 |
-
stats_df[metric].fillna(0) * weight
|
1031 |
for metric, weight in metric_weights.items()
|
1032 |
)
|
1033 |
-
|
1034 |
best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
|
1035 |
-
|
1036 |
recommendations = {
|
1037 |
'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
|
1038 |
'best_settings': {
|
@@ -1059,7 +1069,7 @@ def analyze_results(stats_df):
|
|
1059 |
'expected_result_rank': int(best_config['expected_result_rank'])
|
1060 |
}
|
1061 |
}
|
1062 |
-
|
1063 |
return recommendations
|
1064 |
|
1065 |
####
|
@@ -1069,72 +1079,85 @@ def get_llm_suggested_settings(file, num_chunks=1):
|
|
1069 |
return {"error": "No file uploaded"}
|
1070 |
|
1071 |
chunks, _, _ = process_files(
|
1072 |
-
file.name,
|
1073 |
-
'HuggingFace',
|
1074 |
-
'paraphrase-miniLM',
|
1075 |
-
'recursive',
|
1076 |
-
250,
|
1077 |
50,
|
1078 |
custom_separators=None
|
1079 |
)
|
1080 |
-
|
1081 |
# Select a few random chunks
|
1082 |
sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
|
1083 |
 -    (lines 1083-1137 were blank and are removed)
1138 |
print("setting suggested")
|
1139 |
print(suggested_settings)
|
1140 |
# Parse the generated text to extract the dictionary
|
@@ -1160,7 +1183,7 @@ Provide your suggestions in a Python dictionary format."""
|
|
1160 |
def update_inputs_with_llm_suggestions(suggestions):
|
1161 |
if suggestions is None or "error" in suggestions:
|
1162 |
return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
|
1163 |
-
|
1164 |
return [
|
1165 |
gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
|
1166 |
gr.update(value=suggestions["split_strategy"]), # split_strategy_input
|
@@ -1178,16 +1201,16 @@ def update_inputs_with_llm_suggestions(suggestions):
|
|
1178 |
def parse_model_selections(default_models, custom_models):
|
1179 |
"""
|
1180 |
Parse selected default models and custom models into model configurations
|
1181 |
-
|
1182 |
Args:
|
1183 |
default_models (List[str]): Selected default models in format "type:name"
|
1184 |
custom_models (str): Custom models string with one model per line in format "type:name"
|
1185 |
-
|
1186 |
Returns:
|
1187 |
List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
|
1188 |
"""
|
1189 |
model_configs = []
|
1190 |
-
|
1191 |
# Process default models
|
1192 |
if default_models:
|
1193 |
for model in default_models:
|
@@ -1196,7 +1219,7 @@ def parse_model_selections(default_models, custom_models):
|
|
1196 |
'type': model_type,
|
1197 |
'name': model_name
|
1198 |
})
|
1199 |
-
|
1200 |
# Process custom models
|
1201 |
if custom_models:
|
1202 |
custom_model_lines = custom_models.strip().split('\n')
|
@@ -1207,7 +1230,7 @@ def parse_model_selections(default_models, custom_models):
|
|
1207 |
'type': model_type.strip(),
|
1208 |
'name': model_name.strip()
|
1209 |
})
|
1210 |
-
|
1211 |
return model_configs
|
1212 |
|
1213 |
def parse_comma_separated(text):
|
@@ -1217,12 +1240,12 @@ def parse_comma_separated(text):
|
|
1217 |
return [x.strip() for x in text.split(',') if x.strip()]
|
1218 |
|
1219 |
|
1220 |
-
|
1221 |
# Gradio Interface
|
1222 |
def launch_interface(debug=True):
|
1223 |
with gr.Blocks() as iface:
|
1224 |
gr.Markdown("# Advanced Embedding Comparison Tool")
|
1225 |
-
|
1226 |
with gr.Tab("Simple"):
|
1227 |
file_input = gr.File(label="Upload File (Optional)")
|
1228 |
query_input = gr.Textbox(label="Search Query")
|
@@ -1237,7 +1260,7 @@ def launch_interface(debug=True):
|
|
1237 |
label="Embedding Models"
|
1238 |
)
|
1239 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
1240 |
-
|
1241 |
with gr.Tab("Advanced"):
|
1242 |
custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
|
1243 |
split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
|
@@ -1247,7 +1270,7 @@ def launch_interface(debug=True):
|
|
1247 |
vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
|
1248 |
search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
|
1249 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
1250 |
-
|
1251 |
with gr.Tab("Expert"):
|
1252 |
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
|
1253 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
@@ -1265,7 +1288,7 @@ def launch_interface(debug=True):
|
|
1265 |
with gr.Row():
|
1266 |
auto_file_input = gr.File(label="Upload File (Optional)")
|
1267 |
auto_query_input = gr.Textbox(label="Search Query")
|
1268 |
-
|
1269 |
with gr.Row():
|
1270 |
auto_expected_result_input = gr.Textbox(
|
1271 |
label="Expected Result (Optional)",
|
@@ -1275,18 +1298,18 @@ def launch_interface(debug=True):
|
|
1275 |
label="Model Feedback (Optional)",
|
1276 |
placeholder="Enter any feedback about model performance"
|
1277 |
)
|
1278 |
-
|
1279 |
with gr.Row():
|
1280 |
with gr.Column():
|
1281 |
# Default model selection
|
1282 |
default_models_input = gr.CheckboxGroup(
|
1283 |
-
choices=[f"{type}:{name}"
|
1284 |
-
for type, names in DEFAULT_MODELS.items()
|
1285 |
for name in names],
|
1286 |
label="Default Models",
|
1287 |
value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
|
1288 |
)
|
1289 |
-
|
1290 |
with gr.Column():
|
1291 |
# Custom model input
|
1292 |
custom_models_input = gr.TextArea(
|
@@ -1294,7 +1317,7 @@ def launch_interface(debug=True):
|
|
1294 |
placeholder="Enter one model per line in format: type:name",
|
1295 |
lines=3
|
1296 |
)
|
1297 |
-
|
1298 |
auto_split_strategies = gr.CheckboxGroup(
|
1299 |
choices=["token", "recursive"],
|
1300 |
label="Split Strategies to Test"
|
@@ -1313,21 +1336,21 @@ def launch_interface(debug=True):
|
|
1313 |
auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
|
1314 |
auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
|
1315 |
auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
|
1316 |
-
|
1317 |
auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
|
1318 |
auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
|
1319 |
recommendations_output = gr.JSON(label="Recommendations")
|
1320 |
-
|
1321 |
def run_automation(file_input, query_input, expected_result, default_models, custom_models,
|
1322 |
split_strategies, chunk_sizes, overlap_sizes,
|
1323 |
vector_store_types, search_types, top_k_values,
|
1324 |
optimize_vocab, use_query_optimization, use_reranking,
|
1325 |
model_feedback):
|
1326 |
"""Wrapper function to handle Gradio inputs and run automated tests"""
|
1327 |
-
|
1328 |
# Parse model configurations
|
1329 |
model_configs = parse_model_selections(default_models, custom_models)
|
1330 |
-
|
1331 |
# Parse test parameters
|
1332 |
test_params = {
|
1333 |
'split_strategy': split_strategies,
|
@@ -1346,7 +1369,7 @@ def launch_interface(debug=True):
|
|
1346 |
'custom_separators': [None],
|
1347 |
'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
|
1348 |
}
|
1349 |
-
|
1350 |
# Run automated tests
|
1351 |
results_df, stats_df = run_automated_tests(
|
1352 |
file_input.name if file_input else None,
|
@@ -1356,12 +1379,12 @@ def launch_interface(debug=True):
|
|
1356 |
expected_result if expected_result else None,
|
1357 |
model_feedback if model_feedback else None
|
1358 |
)
|
1359 |
-
|
1360 |
# Generate recommendations based on results
|
1361 |
recommendations = analyze_results(stats_df)
|
1362 |
-
|
1363 |
return results_df, stats_df, recommendations
|
1364 |
-
|
1365 |
auto_submit_button = gr.Button("Run Automated Tests")
|
1366 |
auto_submit_button.click(
|
1367 |
fn=run_automation,
|
@@ -1376,25 +1399,25 @@ def launch_interface(debug=True):
|
|
1376 |
outputs=[auto_results_output, auto_stats_output, recommendations_output]
|
1377 |
)
|
1378 |
###
|
1379 |
-
|
1380 |
with gr.Tab("Results"):
|
1381 |
with gr.Row():
|
1382 |
results_output = gr.DataFrame(label="Results")
|
1383 |
stats_output = gr.DataFrame(label="Statistics")
|
1384 |
-
|
1385 |
with gr.Row():
|
1386 |
plot_output = gr.Plot(label="Visualizations")
|
1387 |
model_rankings_output = gr.JSON(label="Model Rankings")
|
1388 |
-
|
1389 |
with gr.Row():
|
1390 |
recommendations_output = gr.JSON(label="Recommendations")
|
1391 |
-
|
1392 |
with gr.Tab("LLM Suggestions"):
|
1393 |
llm_file_input = gr.File(label="Upload File for LLM Suggestions")
|
1394 |
llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
|
1395 |
llm_suggest_button = gr.Button("Get LLM Suggestions")
|
1396 |
llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
|
1397 |
-
|
1398 |
llm_suggest_button.click(
|
1399 |
fn=get_llm_suggested_settings,
|
1400 |
inputs=[llm_file_input, llm_num_chunks],
|
@@ -1403,9 +1426,9 @@ def launch_interface(debug=True):
|
|
1403 |
fn=update_inputs_with_llm_suggestions,
|
1404 |
inputs=[llm_suggestions_output],
|
1405 |
outputs=[
|
1406 |
-
embedding_models_input, split_strategy_input, chunk_size_input,
|
1407 |
-
overlap_size_input, vector_store_type_input, search_type_input,
|
1408 |
-
top_k_input, apply_preprocessing_input, optimize_vocab_input,
|
1409 |
apply_phonetic_input, phonetic_weight_input
|
1410 |
]
|
1411 |
)
|
@@ -1526,7 +1549,7 @@ Create a simple chat interface and test with various queries about the AI Act. F
|
|
1526 |
User: "Was sind die Hauptziele des KI-Gesetzes?"
|
1527 |
"""
|
1528 |
|
1529 |
-
|
1530 |
tutorial_md = """
|
1531 |
# Advanced Embedding Comparison Tool Tutorial
|
1532 |
|
@@ -1675,13 +1698,13 @@ Measures how well an object fits within its own cluster compared to others. Scor
|
|
1675 |
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
|
1676 |
with open(file_path, 'r', encoding='utf-8') as f:
|
1677 |
text = f.read()
|
1678 |
-
|
1679 |
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
|
1680 |
tokenizer.pre_tokenizer = Whitespace()
|
1681 |
-
|
1682 |
trainer = trainers.WordLevelTrainer(special_tokens=special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
|
1683 |
tokenizer.train_from_iterator([text], trainer)
|
1684 |
-
|
1685 |
return tokenizer
|
1686 |
````
|
1687 |
|
@@ -1713,39 +1736,39 @@ def rerank_results(results, query, reranker):
|
|
1713 |
|
1714 |
|
1715 |
## Useful Resources and Links
|
1716 |
-
|
1717 |
Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
|
1718 |
-
|
1719 |
### Embeddings and Vector Databases
|
1720 |
- [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
|
1721 |
- [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
|
1722 |
- [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
|
1723 |
-
|
1724 |
### Natural Language Processing
|
1725 |
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
|
1726 |
- [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
|
1727 |
- [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
|
1728 |
-
|
1729 |
### Retrieval-Augmented Generation (RAG)
|
1730 |
- [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
|
1731 |
- [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
|
1732 |
-
|
1733 |
### German Language Processing
|
1734 |
- [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
|
1735 |
- [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
|
1736 |
-
|
1737 |
### Benchmarks and Evaluation
|
1738 |
- [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
|
1739 |
- [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
|
1740 |
-
|
1741 |
### Tools and Libraries
|
1742 |
- [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
|
1743 |
- [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
|
1744 |
-
|
1745 |
### Support me
|
1746 |
- [Visual Crew Builder](https://visual-crew.builder.ai/): Tool for creating AI systems, workflows, and APIs. Or just a notebook.
|
1747 |
-
|
1748 |
-
|
1749 |
|
1750 |
This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
|
1751 |
|
@@ -1768,7 +1791,7 @@ def create_chat_app(settings):
|
|
1768 |
settings['lang'],
|
1769 |
settings['apply_preprocessing']
|
1770 |
)
|
1771 |
-
|
1772 |
results, _, _, _ = search_embeddings(
|
1773 |
chunks,
|
1774 |
embedding_model,
|
@@ -1780,12 +1803,12 @@ def create_chat_app(settings):
|
|
1780 |
apply_phonetic=settings['apply_phonetic'],
|
1781 |
phonetic_weight=settings['phonetic_weight']
|
1782 |
)
|
1783 |
-
|
1784 |
# Generate a response based on the retrieved results
|
1785 |
response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
|
1786 |
for i, result in enumerate(results[:settings['top_k']]):
|
1787 |
response += f"{i+1}. {result['content'][:100]}...\n\n"
|
1788 |
-
|
1789 |
return response
|
1790 |
|
1791 |
with gr.Blocks() as chat_interface:
|
@@ -1823,7 +1846,7 @@ if __name__ == "__main__":
|
|
1823 |
launch_interface()
|
1824 |
# Uncomment the following line to launch the sample chat app
|
1825 |
```
|
1826 |
-
|
1827 |
"""
|
1828 |
|
1829 |
|
@@ -1832,10 +1855,10 @@ if __name__ == "__main__":
|
|
1832 |
["Embedding Comparison", "Tutorial", "Use Case"]
|
1833 |
)
|
1834 |
|
1835 |
-
iface.launch(debug=
|
1836 |
|
1837 |
# Enhanced Automated Testing
|
1838 |
-
def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
|
1839 |
test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
|
1840 |
model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
1841 |
"""
|
@@ -1844,16 +1867,16 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
1844 |
all_results = []
|
1845 |
all_stats = []
|
1846 |
model_manager = ModelManager()
|
1847 |
-
|
1848 |
# Create parameter grid excluding model configurations
|
1849 |
base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
|
1850 |
param_grid = ParameterGrid(base_params)
|
1851 |
-
|
1852 |
# Test each model configuration with all parameter combinations
|
1853 |
for model_config in tqdm(model_configs, desc="Testing models"):
|
1854 |
model_type = model_config['type']
|
1855 |
model_name = model_config['name']
|
1856 |
-
|
1857 |
for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
|
1858 |
try:
|
1859 |
# Process files and get chunks
|
@@ -1868,11 +1891,11 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
1868 |
params['lang'],
|
1869 |
params['apply_preprocessing']
|
1870 |
)
|
1871 |
-
|
1872 |
# Apply vocabulary optimization if specified
|
1873 |
if params['optimize_vocab']:
|
1874 |
tokenizer, chunks = optimize_vocabulary(chunks)
|
1875 |
-
|
1876 |
# Apply query optimization if specified
|
1877 |
current_query = query
|
1878 |
if params['use_query_optimization']:
|
@@ -1886,7 +1909,7 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
1886 |
params['top_k']
|
1887 |
)
|
1888 |
current_query = " ".join(optimized_queries)
|
1889 |
-
|
1890 |
# Perform search
|
1891 |
results, search_time, vector_store, raw_results = search_embeddings(
|
1892 |
chunks,
|
@@ -1900,25 +1923,25 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
1900 |
params['apply_phonetic'],
|
1901 |
params['phonetic_weight']
|
1902 |
)
|
1903 |
-
|
1904 |
# Apply reranking if specified
|
1905 |
if params['use_reranking']:
|
1906 |
-
reranker = pipeline("text-classification",
|
1907 |
model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
1908 |
raw_results = rerank_results(raw_results, current_query, reranker)
|
1909 |
-
|
1910 |
# Calculate statistics
|
1911 |
stats = ResultAnalyzer.calculate_statistics(
|
1912 |
raw_results, search_time, vector_store, num_tokens,
|
1913 |
embedding_model, current_query, params['top_k'],
|
1914 |
expected_result, model_feedback
|
1915 |
)
|
1916 |
-
|
1917 |
# Update model rankings
|
1918 |
model_id = f"{model_type}:{model_name}"
|
1919 |
ranking_score = calculate_model_ranking_score(stats)
|
1920 |
model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
|
1921 |
-
|
1922 |
# Add model information to stats
|
1923 |
stats.update({
|
1924 |
"model_type": model_type,
|
@@ -1926,15 +1949,15 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
1926 |
"model": f"{model_type} - {model_name}",
|
1927 |
**params
|
1928 |
})
|
1929 |
-
|
1930 |
# Format and store results
|
1931 |
all_results.extend(format_results(raw_results, stats))
|
1932 |
all_stats.append(stats)
|
1933 |
-
|
1934 |
except Exception as e:
|
1935 |
print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
|
1936 |
continue
|
1937 |
-
|
1938 |
return pd.DataFrame(all_results), pd.DataFrame(all_stats)
|
1939 |
|
1940 |
# Helper function to calculate model ranking score
|
@@ -1947,7 +1970,7 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
|
|
1947 |
'contains_expected': 0.3,
|
1948 |
'expected_result_rank': -0.2 # Negative weight because lower rank is better
|
1949 |
}
|
1950 |
-
|
1951 |
score = 0.0
|
1952 |
for metric, weight in weights.items():
|
1953 |
if metric in stats and not isinstance(stats[metric], str):
|
@@ -1958,9 +1981,8 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
|
|
1958 |
else:
|
1959 |
value = float(stats[metric])
|
1960 |
score += weight * value
|
1961 |
-
|
1962 |
return score
|
1963 |
|
1964 |
if __name__ == "__main__":
|
1965 |
launch_interface()
|
1966 |
-
|
|
|
 41   from typing import List, Tuple, Optional
 42
 43
 44 + #hf_token = os.getenv("hf_token")
 45 + #login(token=hf_token)
 46
 47   # Define the model pipeline with additional generation parameters
 48   #model_pipeline = pipeline(
154 |
}
|
155 |
}
|
156 |
|
157 |
+
|
158 |
def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
|
159 |
"""Update model ranking based on performance and optional feedback"""
|
160 |
current_score = self.rankings.get(model_id, 0.0)
|
161 |
# Weighted average of current score and new score
|
162 |
self.rankings[model_id] = 0.7 * current_score + 0.3 * score
|
163 |
+
|
164 |
if feedback:
|
165 |
if model_id not in self.model_stats:
|
166 |
self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
|
167 |
self.model_stats[model_id]["feedback_count"] += 1
|
168 |
self.model_stats[model_id]["feedback"].append(feedback)
|
169 |
+
|
170 |
def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
|
171 |
"""Get top n ranked models"""
|
172 |
return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
|
173 |
+
|
174 |
def get_model_stats(self, model_id: str) -> Dict[str, Any]:
|
175 |
"""Get statistics for a specific model"""
|
176 |
return self.model_stats.get(model_id, {})
|
177 |
|
178 |
+
|
179 |
def add_model(self, provider, name, model_path):
|
180 |
if provider not in self.models:
|
181 |
self.models[provider] = {}
|
|
|
286 |
def preprocess_text(text, lang='german', apply_preprocessing=False):
|
287 |
if not apply_preprocessing:
|
288 |
return text
|
289 |
+
|
290 |
text = text.lower()
|
291 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
292 |
+
|
293 |
try:
|
294 |
tokens = word_tokenize(text, language=lang)
|
295 |
except LookupError:
|
296 |
print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
|
297 |
tokens = simple_tokenize(text)
|
298 |
+
|
299 |
try:
|
300 |
stop_words = set(stopwords.words(lang))
|
301 |
except LookupError:
|
302 |
print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
|
303 |
stop_words = set()
|
304 |
tokens = [token for token in tokens if token not in stop_words]
|
305 |
+
|
306 |
try:
|
307 |
stemmer = SnowballStemmer(lang)
|
308 |
tokens = [stemmer.stem(token) for token in tokens]
|
309 |
except ValueError:
|
310 |
print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
|
311 |
+
|
312 |
return ' '.join(tokens)
|
313 |
|
314 |
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
|
|
|
341 |
) -> str:
|
342 |
"""
|
343 |
CPU-optimized version of query expansion using a small language model.
|
344 |
+
|
345 |
Args:
|
346 |
query: Original search query
|
347 |
query_optimization_model: Name or path of the model to use for optimization
|
|
|
351 |
search_type: Type of search being performed
|
352 |
top_k: Number of expansion terms to add
|
353 |
use_gpu: Whether to use GPU if available (defaults to False for CPU)
|
354 |
+
|
355 |
Returns:
|
356 |
Expanded query string
|
357 |
"""
|
358 |
try:
|
359 |
# Set device
|
360 |
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
361 |
+
|
362 |
# 1. Basic text preprocessing (CPU-based)
|
363 |
tokens = word_tokenize(query.lower())
|
364 |
+
|
365 |
# 2. WordNet synonyms expansion (CPU-based)
|
366 |
expanded_terms = set()
|
367 |
for token in tokens:
|
|
|
370 |
for syn in synsets:
|
371 |
# Limit number of lemmas
|
372 |
expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
|
373 |
+
|
374 |
# 3. Use provided model with reduced complexity
|
375 |
try:
|
376 |
# Load model with reduced memory footprint
|
|
|
384 |
low_cpu_mem_usage=True,
|
385 |
device_map="cpu"
|
386 |
)
|
387 |
+
|
388 |
# Move model to CPU and eval mode
|
389 |
model = model.to(device)
|
390 |
model.eval()
|
391 |
+
|
392 |
# Prepare input with reduced length
|
393 |
prompt = f"Enhance this search query with relevant terms: {query}"
|
394 |
inputs = tokenizer(
|
|
|
398 |
truncation=True,
|
399 |
padding=True
|
400 |
)
|
401 |
+
|
402 |
# Generate with minimal parameters
|
403 |
with torch.no_grad():
|
404 |
outputs = model.generate(
|
|
|
409 |
do_sample=False,
|
410 |
early_stopping=True
|
411 |
)
|
412 |
+
|
413 |
enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
414 |
+
|
415 |
# Clear CUDA cache if GPU was used
|
416 |
if device == "cuda":
|
417 |
torch.cuda.empty_cache()
|
418 |
+
|
419 |
except Exception as model_error:
|
420 |
print(f"Model-based expansion failed: {str(model_error)}")
|
421 |
enhanced_query = query
|
422 |
+
|
423 |
# 4. Combine original and expanded terms
|
424 |
final_terms = set(tokens)
|
425 |
final_terms.update(expanded_terms)
|
426 |
if enhanced_query != query:
|
427 |
final_terms.update(word_tokenize(enhanced_query.lower()))
|
428 |
+
|
429 |
# 5. Remove stopwords and select top_k most relevant terms
|
430 |
stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
|
431 |
final_terms = [term for term in final_terms if term not in stopwords]
|
432 |
+
|
433 |
# Combine with original query
|
434 |
expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
|
435 |
+
|
436 |
# Clean up
|
437 |
del model
|
438 |
del tokenizer
|
439 |
if device == "cuda":
|
440 |
torch.cuda.empty_cache()
|
441 |
+
|
442 |
+
return expanded_query.strip() #[Document(page_content=expanded_query.strip())]
|
443 |
+
|
444 |
except Exception as e:
|
445 |
print(f"Query optimization failed: {str(e)}")
|
446 |
+
return query #[Document(page_content=query)] # Return original query if optimization fails
|
447 |
|
448 |
|
449 |
|
|
|
458 |
use_gpu=False # Explicitly use CPU
|
459 |
)
|
460 |
"""
|
461 |
+
|
462 |
|
463 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
464 |
tokenized_texts = [text.split() for text in texts]
|
465 |
+
|
466 |
if model_type == 'word2vec':
|
467 |
model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
468 |
elif model_type == 'fasttext':
|
469 |
model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
470 |
else:
|
471 |
raise ValueError("Unsupported model type")
|
472 |
+
|
473 |
return model
|
474 |
|
475 |
class CustomEmbeddings(HuggingFaceEmbeddings):
|
476 |
def __init__(self, model_path):
|
477 |
self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
|
478 |
+
|
479 |
def embed_documents(self, texts):
|
480 |
return [self.model.wv[text.split()] for text in texts]
|
481 |
+
|
482 |
def embed_query(self, text):
|
483 |
return self.model.wv[text.split()]
|
484 |
|
|
|
520 |
chunk_size=chunk_size,
|
521 |
chunk_overlap=overlap_size,
|
522 |
add_start_index=True, # If `True`, includes chunk's start index in metadata
|
523 |
+
strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
|
524 |
separators=custom_separators or ["\n\n", "\n", " ", ""]
|
525 |
)
|
526 |
else:
|
|
|
534 |
multi_process=True,
|
535 |
# model_kwargs={"device": "cpu"},
|
536 |
#encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
|
537 |
+
)
|
538 |
elif model_type == 'OpenAI':
|
539 |
return OpenAIEmbeddings(model=model_path)
|
540 |
elif model_type == 'Cohere':
|
|
|
566 |
phonetic_sim = phonetic_match(doc_text, query)
|
567 |
combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
|
568 |
return combined_sim
|
569 |
+
|
570 |
def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
|
571 |
chunks = list(chunks_tuple)
|
572 |
+
|
573 |
if vector_store_type == 'FAISS':
|
574 |
return FAISS.from_texts(chunks, embedding_model)
|
575 |
elif vector_store_type == 'Chroma':
|
|
|
587 |
for file in os.listdir(FILES_DIR):
|
588 |
file_path = os.path.join(FILES_DIR, file)
|
589 |
text += FileHandler.extract_text(file_path)
|
590 |
+
|
591 |
if custom_tokenizer_file:
|
592 |
tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
|
593 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
|
|
603 |
|
604 |
def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
|
605 |
preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
|
606 |
+
|
607 |
vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
|
608 |
retriever = get_retriever(vector_store, search_type, {"k": top_k})
|
609 |
|
|
|
613 |
#this should be optional
|
614 |
def score_result(doc):
|
615 |
base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
|
616 |
+
|
617 |
# Add bonus for containing expected result
|
618 |
expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
|
619 |
+
|
620 |
if apply_phonetic:
|
621 |
phonetic_score = phonetic_match(doc.page_content, query)
|
622 |
return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
|
|
|
645 |
# Enhanced Result Analysis
|
646 |
class ResultAnalyzer:
|
647 |
@staticmethod
|
648 |
+
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
|
649 |
top_k, expected_result=None, model_feedback=None):
|
650 |
stats = {
|
651 |
"num_results": len(results),
|
|
|
657 |
"embedding_dimension": len(embedding_model.embed_query(query)),
|
658 |
"top_k": top_k,
|
659 |
}
|
660 |
+
|
661 |
# Add vector store statistics
|
662 |
try:
|
663 |
if hasattr(vector_store, '_index'):
|
|
|
666 |
stats["vector_store_size"] = len(vector_store._collection.get())
|
667 |
except:
|
668 |
stats["vector_store_size"] = "N/A"
|
669 |
+
|
670 |
# Add expected result statistics if provided
|
671 |
if expected_result:
|
672 |
stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
|
673 |
+
stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
|
674 |
if expected_result in doc.page_content), -1) + 1
|
675 |
+
|
676 |
# Calculate diversity metrics for larger result sets
|
677 |
if len(results) > 3: # Changed from 1000 to make it more practical
|
678 |
embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
|
681 |
else:
|
682 |
stats["result_diversity"] = "N/A"
|
683 |
stats["silhouette_score"] = "N/A"
|
684 |
+
|
685 |
# Add ranking correlation
|
686 |
query_embedding = embedding_model.embed_query(query)
|
687 |
result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
|
691 |
stats["rank_correlation"] = rank_correlation
|
692 |
else:
|
693 |
stats["rank_correlation"] = "N/A"
|
694 |
+
|
695 |
# Add model feedback if provided
|
696 |
if model_feedback:
|
697 |
stats["model_feedback"] = model_feedback
|
698 |
+
|
699 |
return stats
|
700 |
+
|
701 |
@staticmethod
|
702 |
def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
|
703 |
"""Calculate diversity score for embeddings"""
|
704 |
embeddings_array = np.array(embeddings)
|
705 |
pairwise_similarities = np.inner(embeddings_array, embeddings_array)
|
706 |
return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
|
707 |
+
|
708 |
@staticmethod
|
709 |
def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
|
710 |
"""Calculate silhouette score for embeddings"""
|
|
|
724 |
# Add model column if not present
|
725 |
if 'model' not in stats_df.columns:
|
726 |
stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
|
727 |
+
|
728 |
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
|
729 |
+
|
730 |
# Handle empty dataframe case
|
731 |
if len(stats_df) == 0:
|
732 |
return fig
|
733 |
+
|
734 |
# Create plots with error handling
|
735 |
try:
|
736 |
sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
|
|
|
738 |
axs[0, 0].tick_params(axis='x', rotation=45)
|
739 |
except Exception as e:
|
740 |
print(f"Error in search time plot: {e}")
|
741 |
+
|
742 |
try:
|
743 |
+
sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
|
744 |
hue='model', ax=axs[0, 1])
|
745 |
axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
|
746 |
except Exception as e:
|
747 |
print(f"Error in diversity plot: {e}")
|
748 |
+
|
749 |
try:
|
750 |
sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
|
751 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
752 |
axs[1, 0].tick_params(axis='x', rotation=45)
|
753 |
except Exception as e:
|
754 |
print(f"Error in content length plot: {e}")
|
755 |
+
|
756 |
try:
|
757 |
valid_embeddings = results_df['embedding'].dropna().values
|
758 |
if len(valid_embeddings) > 1:
|
759 |
tsne = TSNE(n_components=2, random_state=42)
|
760 |
embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
|
761 |
+
sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
|
762 |
+
hue=results_df['Model'][:len(valid_embeddings)],
|
763 |
ax=axs[1, 1])
|
764 |
axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
|
765 |
else:
|
766 |
+
axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
|
767 |
ha='center', va='center')
|
768 |
except Exception as e:
|
769 |
print(f"Error in embedding visualization: {e}")
|
770 |
+
|
771 |
plt.tight_layout()
|
772 |
return fig
|
773 |
|
|
|
778 |
#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
|
779 |
#plt.show()
|
780 |
|
781 |
+
|
782 |
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
|
783 |
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
|
784 |
|
785 |
word_freq = Counter(word for text in texts for word in text.split())
|
786 |
+
|
787 |
optimized_texts = [
|
788 |
' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
|
789 |
for text in texts
|
790 |
]
|
791 |
+
|
792 |
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
793 |
tokenizer.train_from_iterator(optimized_texts, trainer)
|
794 |
+
|
795 |
return tokenizer, optimized_texts
|
796 |
+
|
797 |
import numpy as np
|
798 |
from transformers import TextClassificationPipeline
|
799 |
from typing import List, Union, Any
|
800 |
|
801 |
+
|
802 |
|
803 |
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
804 |
|
805 |
|
806 |
def rerank_results(
|
807 |
+
results: List[Any],
|
808 |
+
query: str,
|
809 |
reranker: Union[TextClassificationPipeline, Any]
|
810 |
) -> List[Any]:
|
811 |
"""
|
812 |
+
|
813 |
"""
|
814 |
if not results:
|
815 |
return results
|
816 |
+
|
817 |
# Step 1: Encode the query and documents using SentenceTransformer
|
818 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
819 |
doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
|
820 |
doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
|
821 |
+
|
822 |
# Step 2: Compute cosine similarities between query and document embeddings
|
823 |
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
|
824 |
+
|
825 |
# Step 3: Sort documents by similarity score in descending order
|
826 |
+
reranked_idx = np.argsort(cosine_scores.cpu().numpy())[::-1]
|
827 |
+
|
828 |
# Step 4: Return the reranked documents
|
829 |
reranked_results = [results[i] for i in reranked_idx]
|
830 |
+
|
831 |
return reranked_results
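For context, a minimal standalone sketch of the same cosine-similarity reranking idea, runnable on its own; the toy documents, query, and variable names here are illustrative and not part of app.py:

```python
from sentence_transformers import SentenceTransformer, util
import numpy as np

st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

docs = ["Das KI-Gesetz regelt Hochrisiko-Systeme.", "A recipe for apple cake."]
query = "Was regelt das KI-Gesetz?"

# Encode query and documents, score by cosine similarity, sort best-first.
q_emb = st_model.encode(query, convert_to_tensor=True)
d_emb = st_model.encode(docs, convert_to_tensor=True)
scores = util.cos_sim(q_emb, d_emb)[0]          # shape: (len(docs),)
order = np.argsort(scores.cpu().numpy())[::-1]  # indices, best match first
reranked = [docs[i] for i in order]
```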
|
832 |
|
833 |
|
|
|
878 |
if optimize_vocab:
|
879 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
880 |
chunks = optimized_chunks
|
881 |
+
|
882 |
search_query = query
|
883 |
+
|
884 |
if use_query_optimization:
|
885 |
optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
|
886 |
#query = " ".join(optimized_queries)
|
887 |
+
search_query = optimized_queries # " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
|
888 |
|
889 |
results, search_time, vector_store, results_raw = search_embeddings(
|
890 |
chunks,
|
|
|
897 |
lang,
|
898 |
apply_phonetic,
|
899 |
phonetic_weight
|
900 |
+
)
|
901 |
+
|
902 |
if use_reranking:
|
903 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
904 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
953 |
def automated_testing(file, query, test_params, expected_result=None):
|
954 |
all_results = []
|
955 |
all_stats = []
|
956 |
+
|
957 |
param_grid = ParameterGrid(test_params)
|
958 |
print(param_grid)
|
959 |
for params in tqdm(param_grid, desc="Running tests"):
|
|
|
995 |
params['apply_phonetic'],
|
996 |
params['phonetic_weight']
|
997 |
)
|
998 |
+
|
999 |
if params['use_reranking']:
|
1000 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
1001 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
1022 |
'contains_expected': 0.5, # High weight for containing the expected result
|
1023 |
'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
|
1024 |
}
|
 1025 +     if stats_df.empty:
 1026 +         print("stats_df is empty. Cannot compute best configuration.")
 1027 +         return None
 1028 +
 1029       for metric in metric_weights.keys():
 1030 +
 1031 +         if metric in stats_df.columns:
 1032 +             stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
 1033 +         else:
 1034 +             stats_df[metric] = 0
 1035 +             print(f"Column '{metric}' is missing in stats_df; treating it as 0.")
 1036 +
 1037 +
 1038 +
 1039       stats_df['weighted_score'] = sum(
 1040 +         stats_df[metric].fillna(0) * weight
 1041           for metric, weight in metric_weights.items()
 1042       )
|
1043 |
+
|
1044 |
best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
|
1045 |
+
|
1046 |
recommendations = {
|
1047 |
'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
|
1048 |
'best_settings': {
|
|
|
1069 |
'expected_result_rank': int(best_config['expected_result_rank'])
|
1070 |
}
|
1071 |
}
|
1072 |
+
|
1073 |
return recommendations
|
1074 |
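# Illustrative toy example (hypothetical data, not part of the app) of how the weighted
# score above picks a configuration; negative weights penalise metrics where smaller
# values are better:
#
#   toy = pd.DataFrame([
#       {"model": "A", "contains_expected": 1.0, "expected_result_rank": 1},
#       {"model": "B", "contains_expected": 0.0, "expected_result_rank": 3},
#   ])
#   weights = {"contains_expected": 0.5, "expected_result_rank": -0.4}
#   toy["weighted_score"] = sum(toy[m].fillna(0) * w for m, w in weights.items())
#   toy.loc[toy["weighted_score"].idxmax(), "model"]   # -> "A" (0.1 vs -1.2)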
|
1075 |
####
|
|
|
1079 |
return {"error": "No file uploaded"}
|
1080 |
|
1081 |
chunks, _, _ = process_files(
|
1082 |
+
file.name,
|
1083 |
+
'HuggingFace',
|
1084 |
+
'paraphrase-miniLM',
|
1085 |
+
'recursive',
|
1086 |
+
250,
|
1087 |
50,
|
1088 |
custom_separators=None
|
1089 |
)
|
1090 |
+
|
1091 |
# Select a few random chunks
|
1092 |
sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
|
1093 |
+
|
1094 |
+
|
1095 |
+
llm_pipeline = pipeline(model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')
|
1096 |
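# Note (an assumption about deployment, not a change to the code): device='cuda' requires
# a GPU; on CPU-only hardware the pipeline call above will fail. A hedged alternative:
#   llm_pipeline = pipeline(model="meta-llama/Llama-3.2-1B-Instruct",
#                           device=0 if torch.cuda.is_available() else -1)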
+
|
1097 |
+
|
1098 |
+
prompt=f'''
|
1099 |
+
<|start_header_id|>system<|end_header_id|>
|
1100 |
+
You are an expert in information retrieval.
|
1101 |
+
You know the strengths and weaknesses of all models.
|
1102 |
+
|
1103 |
+
Given the following text chunks from a document,
|
1104 |
+
suggest optimal settings for an embedding-based search system. The settings should include:
|
1105 |
+
|
1106 |
+
1. Embedding model type and name
|
1107 |
+
2. Split strategy (token or recursive)
|
1108 |
+
3. Chunk size
|
1109 |
+
4. Overlap size
|
1110 |
+
5. Vector store type (FAISS or Chroma)
|
1111 |
+
6. Search type (similarity, mmr, or custom)
|
1112 |
+
7. Top K results to retrieve
|
1113 |
+
8. Whether to apply preprocessing
|
1114 |
+
9. Whether to optimize vocabulary
|
1115 |
+
10. Whether to apply phonetic matching
|
1116 |
+
|
1117 |
+
Expected output format:
|
1118 |
+
{{
|
1119 |
+
"embedding_models": "embedding_model_type:embedding_model_name",
|
1120 |
+
"split_strategy": "token or recursive",
|
1121 |
+
"chunk_size": 250,
|
1122 |
+
"overlap_size": 50,
|
1123 |
+
"vector_store_type": "FAISS or Chroma",
|
1124 |
+
"search_type": "similarity, mmr, or custom",
|
1125 |
+
"top_k": 5,
|
1126 |
+
"apply_preprocessing": True,
|
1127 |
+
"optimize_vocab": True,
|
1128 |
+
"apply_phonetic": False,
|
1129 |
+
"phonetic_weight": 0.3 #
|
1130 |
+
}}
|
1131 |
+
|
1132 |
+
Provide your suggestions in a Python dictionary format.
|
1133 |
+
|
1134 |
+
Show me the settings only. You SHOULD NOT include any other text in the response.
|
1135 |
+
Fill out the settings and choose useful values.
|
1136 |
+
Respect the user's use case and content snippet. Choose the settings based on the chunks.
|
1137 |
+
|
1138 |
+
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
1139 |
+
User use case:
|
1140 |
+
{"small local", "large total context", ...}
|
1141 |
+
|
1142 |
+
total content length:
|
1143 |
+
{len(' '.join(chunks))}
|
1144 |
+
|
1145 |
+
Content snippet:
|
1146 |
+
{' '.join(sample_chunks)}
|
1147 |
+
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
1148 |
+
'''
|
1149 |
+
suggested_settings = llm_pipeline(
|
1150 |
+
prompt,
|
1151 |
+
do_sample=True,
|
1152 |
+
top_k=10,
|
1153 |
+
num_return_sequences=1,
|
1154 |
+
return_full_text=False,
|
1155 |
+
max_new_tokens=1900,  # Control the length of the output
|
1156 |
+
truncation=True, # Enable truncation
|
1157 |
+
)
|
1158 |
+
|
1159 |
+
|
1160 |
+
#suggested_settings = llm.invoke(prompt)
|
1161 |
print("setting suggested")
|
1162 |
print(suggested_settings)
|
1163 |
# Parse the generated text to extract the dictionary
|
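# The parsing code itself is outside this hunk. One common approach (a hedged sketch,
# not necessarily what the file does) is to pull the first {...} block out of the
# generated text and parse it, falling back to ast.literal_eval because the prompt asks
# for Python-dict style output (True/False instead of true/false):
#
#   import ast, json, re
#
#   def parse_llm_settings(generated_text: str) -> dict:
#       match = re.search(r"\{.*\}", generated_text, re.DOTALL)
#       if not match:
#           return {"error": "No settings dictionary found in LLM output"}
#       snippet = match.group(0)
#       try:
#           return json.loads(snippet)
#       except json.JSONDecodeError:
#           try:
#               return ast.literal_eval(snippet)
#           except (ValueError, SyntaxError):
#               return {"error": "Could not parse LLM output"}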
|
|
1183 |
def update_inputs_with_llm_suggestions(suggestions):
|
1184 |
if suggestions is None or "error" in suggestions:
|
1185 |
return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
|
1186 |
+
|
1187 |
return [
|
1188 |
gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
|
1189 |
gr.update(value=suggestions["split_strategy"]), # split_strategy_input
|
|
|
1201 |
def parse_model_selections(default_models, custom_models):
|
1202 |
"""
|
1203 |
Parse selected default models and custom models into model configurations
|
1204 |
+
|
1205 |
Args:
|
1206 |
default_models (List[str]): Selected default models in format "type:name"
|
1207 |
custom_models (str): Custom models string with one model per line in format "type:name"
|
1208 |
+
|
1209 |
Returns:
|
1210 |
List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
|
1211 |
"""
|
1212 |
model_configs = []
|
1213 |
+
|
1214 |
# Process default models
|
1215 |
if default_models:
|
1216 |
for model in default_models:
|
|
|
1219 |
'type': model_type,
|
1220 |
'name': model_name
|
1221 |
})
|
1222 |
+
|
1223 |
# Process custom models
|
1224 |
if custom_models:
|
1225 |
custom_model_lines = custom_models.strip().split('\n')
|
|
|
1230 |
'type': model_type.strip(),
|
1231 |
'name': model_name.strip()
|
1232 |
})
|
1233 |
+
|
1234 |
return model_configs
|
1235 |
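# Hypothetical usage sketch of parse_model_selections (the custom model names below are
# placeholders, not part of the app's defaults):
#   parse_model_selections(
#       ["HuggingFace:paraphrase-miniLM"],
#       "OpenAI:text-embedding-ada-002\nCohere:embed-multilingual-v2.0",
#   )
#   # -> [{'type': 'HuggingFace', 'name': 'paraphrase-miniLM'},
#   #     {'type': 'OpenAI', 'name': 'text-embedding-ada-002'},
#   #     {'type': 'Cohere', 'name': 'embed-multilingual-v2.0'}]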
|
1236 |
def parse_comma_separated(text):
|
|
|
1240 |
return [x.strip() for x in text.split(',') if x.strip()]
|
1241 |
|
1242 |
|
1243 |
+
|
1244 |
# Gradio Interface
|
1245 |
def launch_interface(debug=True):
|
1246 |
with gr.Blocks() as iface:
|
1247 |
gr.Markdown("# Advanced Embedding Comparison Tool")
|
1248 |
+
|
1249 |
with gr.Tab("Simple"):
|
1250 |
file_input = gr.File(label="Upload File (Optional)")
|
1251 |
query_input = gr.Textbox(label="Search Query")
|
|
|
1260 |
label="Embedding Models"
|
1261 |
)
|
1262 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
1263 |
+
|
1264 |
with gr.Tab("Advanced"):
|
1265 |
custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
|
1266 |
split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
|
|
|
1270 |
vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
|
1271 |
search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
|
1272 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
1273 |
+
|
1274 |
with gr.Tab("Expert"):
|
1275 |
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
|
1276 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
|
|
1288 |
with gr.Row():
|
1289 |
auto_file_input = gr.File(label="Upload File (Optional)")
|
1290 |
auto_query_input = gr.Textbox(label="Search Query")
|
1291 |
+
|
1292 |
with gr.Row():
|
1293 |
auto_expected_result_input = gr.Textbox(
|
1294 |
label="Expected Result (Optional)",
|
|
|
1298 |
label="Model Feedback (Optional)",
|
1299 |
placeholder="Enter any feedback about model performance"
|
1300 |
)
|
1301 |
+
|
1302 |
with gr.Row():
|
1303 |
with gr.Column():
|
1304 |
# Default model selection
|
1305 |
default_models_input = gr.CheckboxGroup(
|
1306 |
+
choices=[f"{type}:{name}"
|
1307 |
+
for type, names in DEFAULT_MODELS.items()
|
1308 |
for name in names],
|
1309 |
label="Default Models",
|
1310 |
value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
|
1311 |
)
|
1312 |
+
|
1313 |
with gr.Column():
|
1314 |
# Custom model input
|
1315 |
custom_models_input = gr.TextArea(
|
|
|
1317 |
placeholder="Enter one model per line in format: type:name",
|
1318 |
lines=3
|
1319 |
)
|
1320 |
+
|
1321 |
auto_split_strategies = gr.CheckboxGroup(
|
1322 |
choices=["token", "recursive"],
|
1323 |
label="Split Strategies to Test"
|
|
|
1336 |
auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
|
1337 |
auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
|
1338 |
auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
|
1339 |
+
|
1340 |
auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
|
1341 |
auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
|
1342 |
recommendations_output = gr.JSON(label="Recommendations")
|
1343 |
+
|
1344 |
def run_automation(file_input, query_input, expected_result, default_models, custom_models,
|
1345 |
split_strategies, chunk_sizes, overlap_sizes,
|
1346 |
vector_store_types, search_types, top_k_values,
|
1347 |
optimize_vocab, use_query_optimization, use_reranking,
|
1348 |
model_feedback):
|
1349 |
"""Wrapper function to handle Gradio inputs and run automated tests"""
|
1350 |
+
|
1351 |
# Parse model configurations
|
1352 |
model_configs = parse_model_selections(default_models, custom_models)
|
1353 |
+
|
1354 |
# Parse test parameters
|
1355 |
test_params = {
|
1356 |
'split_strategy': split_strategies,
|
|
|
1369 |
'custom_separators': [None],
|
1370 |
'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
|
1371 |
}
|
1372 |
+
|
1373 |
# Run automated tests
|
1374 |
results_df, stats_df = run_automated_tests(
|
1375 |
file_input.name if file_input else None,
|
|
|
1379 |
expected_result if expected_result else None,
|
1380 |
model_feedback if model_feedback else None
|
1381 |
)
|
1382 |
+
|
1383 |
# Generate recommendations based on results
|
1384 |
recommendations = analyze_results(stats_df)
|
1385 |
+
|
1386 |
return results_df, stats_df, recommendations
|
1387 |
+
|
1388 |
auto_submit_button = gr.Button("Run Automated Tests")
|
1389 |
auto_submit_button.click(
|
1390 |
fn=run_automation,
|
|
|
1399 |
outputs=[auto_results_output, auto_stats_output, recommendations_output]
|
1400 |
)
|
1401 |
###
|
1402 |
+
|
1403 |
with gr.Tab("Results"):
|
1404 |
with gr.Row():
|
1405 |
results_output = gr.DataFrame(label="Results")
|
1406 |
stats_output = gr.DataFrame(label="Statistics")
|
1407 |
+
|
1408 |
with gr.Row():
|
1409 |
plot_output = gr.Plot(label="Visualizations")
|
1410 |
model_rankings_output = gr.JSON(label="Model Rankings")
|
1411 |
+
|
1412 |
with gr.Row():
|
1413 |
recommendations_output = gr.JSON(label="Recommendations")
|
1414 |
+
|
1415 |
with gr.Tab("LLM Suggestions"):
|
1416 |
llm_file_input = gr.File(label="Upload File for LLM Suggestions")
|
1417 |
llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
|
1418 |
llm_suggest_button = gr.Button("Get LLM Suggestions")
|
1419 |
llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
|
1420 |
+
|
1421 |
llm_suggest_button.click(
|
1422 |
fn=get_llm_suggested_settings,
|
1423 |
inputs=[llm_file_input, llm_num_chunks],
|
|
|
1426 |
fn=update_inputs_with_llm_suggestions,
|
1427 |
inputs=[llm_suggestions_output],
|
1428 |
outputs=[
|
1429 |
+
embedding_models_input, split_strategy_input, chunk_size_input,
|
1430 |
+
overlap_size_input, vector_store_type_input, search_type_input,
|
1431 |
+
top_k_input, apply_preprocessing_input, optimize_vocab_input,
|
1432 |
apply_phonetic_input, phonetic_weight_input
|
1433 |
]
|
1434 |
)
|
|
|
1549 |
User: "Was sind die Hauptziele des KI-Gesetzes?"
|
1550 |
"""
|
1551 |
|
1552 |
+
|
1553 |
tutorial_md = """
|
1554 |
# Advanced Embedding Comparison Tool Tutorial
|
1555 |
|
|
|
1698 |
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
|
1699 |
with open(file_path, 'r', encoding='utf-8') as f:
|
1700 |
text = f.read()
|
1701 |
+
|
1702 |
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
|
1703 |
tokenizer.pre_tokenizer = Whitespace()
|
1704 |
+
|
1705 |
trainer = trainers.WordLevelTrainer(special_tokens=special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
|
1706 |
tokenizer.train_from_iterator([text], trainer)
|
1707 |
+
|
1708 |
return tokenizer
|
1709 |
````
|
1710 |
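Note that `create_custom_tokenizer` above always builds a `WordLevelTrainer`, even when `model_type` is BPE. A minimal sketch (an assumption about how you might extend it, not the tool's current behaviour) that picks a trainer matching the model type:

```python
from tokenizers import trainers

def make_trainer(model_type, vocab_size=10000, special_tokens=None):
    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    if model_type == 'WordLevel':
        return trainers.WordLevelTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    # BPE models need a BPE-specific trainer
    return trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
```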
|
|
|
1736 |
|
1737 |
|
1738 |
## Useful Resources and Links
|
1739 |
+
|
1740 |
Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
|
1741 |
+
|
1742 |
### Embeddings and Vector Databases
|
1743 |
- [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
|
1744 |
- [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
|
1745 |
- [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
|
1746 |
+
|
1747 |
### Natural Language Processing
|
1748 |
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
|
1749 |
- [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
|
1750 |
- [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
|
1751 |
+
|
1752 |
### Retrieval-Augmented Generation (RAG)
|
1753 |
- [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
|
1754 |
- [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
|
1755 |
+
|
1756 |
### German Language Processing
|
1757 |
- [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
|
1758 |
- [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
|
1759 |
+
|
1760 |
### Benchmarks and Evaluation
|
1761 |
- [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
|
1762 |
- [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
|
1763 |
+
|
1764 |
### Tools and Libraries
|
1765 |
- [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
|
1766 |
- [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
|
1767 |
+
|
1768 |
### Support me
|
1769 |
- [Visual Crew Builder](https://visual-crew.builder.ai/): Tool for creating AI systems, workflows, and APIs. Or just a notebook.
|
1770 |
+
|
1771 |
+
|
1772 |
|
1773 |
This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
|
1774 |
|
|
|
1791 |
settings['lang'],
|
1792 |
settings['apply_preprocessing']
|
1793 |
)
|
1794 |
+
|
1795 |
results, _, _, _ = search_embeddings(
|
1796 |
chunks,
|
1797 |
embedding_model,
|
|
|
1803 |
apply_phonetic=settings['apply_phonetic'],
|
1804 |
phonetic_weight=settings['phonetic_weight']
|
1805 |
)
|
1806 |
+
|
1807 |
# Generate a response based on the retrieved results
|
1808 |
response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
|
1809 |
for i, result in enumerate(results[:settings['top_k']]):
|
1810 |
response += f"{i+1}. {result['content'][:100]}...\n\n"
|
1811 |
+
|
1812 |
return response
|
1813 |
|
1814 |
with gr.Blocks() as chat_interface:
|
|
|
1846 |
launch_interface()
|
1847 |
# Uncomment the following line to launch the sample chat app
|
1848 |
```
|
1849 |
+
|
1850 |
"""
|
1851 |
|
1852 |
|
|
|
1855 |
["Embedding Comparison", "Tutorial", "Use Case"]
|
1856 |
)
|
1857 |
|
1858 |
+
iface.launch(debug=True, share=True)
|
1859 |
|
1860 |
# Enhanced Automated Testing
|
1861 |
+
def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
|
1862 |
test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
|
1863 |
model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
1864 |
"""
|
|
|
1867 |
all_results = []
|
1868 |
all_stats = []
|
1869 |
model_manager = ModelManager()
|
1870 |
+
|
1871 |
# Create parameter grid excluding model configurations
|
1872 |
base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
|
1873 |
param_grid = ParameterGrid(base_params)
|
1874 |
+
|
1875 |
# Test each model configuration with all parameter combinations
|
1876 |
for model_config in tqdm(model_configs, desc="Testing models"):
|
1877 |
model_type = model_config['type']
|
1878 |
model_name = model_config['name']
|
1879 |
+
|
1880 |
for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
|
1881 |
try:
|
1882 |
# Process files and get chunks
|
|
|
1891 |
params['lang'],
|
1892 |
params['apply_preprocessing']
|
1893 |
)
|
1894 |
+
|
1895 |
# Apply vocabulary optimization if specified
|
1896 |
if params['optimize_vocab']:
|
1897 |
tokenizer, chunks = optimize_vocabulary(chunks)
|
1898 |
+
|
1899 |
# Apply query optimization if specified
|
1900 |
current_query = query
|
1901 |
if params['use_query_optimization']:
|
|
|
1909 |
params['top_k']
|
1910 |
)
|
1911 |
current_query = " ".join(optimized_queries)
|
1912 |
+
|
1913 |
# Perform search
|
1914 |
results, search_time, vector_store, raw_results = search_embeddings(
|
1915 |
chunks,
|
|
|
1923 |
params['apply_phonetic'],
|
1924 |
params['phonetic_weight']
|
1925 |
)
|
1926 |
+
|
1927 |
# Apply reranking if specified
|
1928 |
if params['use_reranking']:
|
1929 |
+
reranker = pipeline("text-classification",
|
1930 |
model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
1931 |
raw_results = rerank_results(raw_results, current_query, reranker)
|
1932 |
+
|
1933 |
# Calculate statistics
|
1934 |
stats = ResultAnalyzer.calculate_statistics(
|
1935 |
raw_results, search_time, vector_store, num_tokens,
|
1936 |
embedding_model, current_query, params['top_k'],
|
1937 |
expected_result, model_feedback
|
1938 |
)
|
1939 |
+
|
1940 |
# Update model rankings
|
1941 |
model_id = f"{model_type}:{model_name}"
|
1942 |
ranking_score = calculate_model_ranking_score(stats)
|
1943 |
model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
|
1944 |
+
|
1945 |
# Add model information to stats
|
1946 |
stats.update({
|
1947 |
"model_type": model_type,
|
|
|
1949 |
"model": f"{model_type} - {model_name}",
|
1950 |
**params
|
1951 |
})
|
1952 |
+
|
1953 |
# Format and store results
|
1954 |
all_results.extend(format_results(raw_results, stats))
|
1955 |
all_stats.append(stats)
|
1956 |
+
|
1957 |
except Exception as e:
|
1958 |
print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
|
1959 |
continue
|
1960 |
+
|
1961 |
return pd.DataFrame(all_results), pd.DataFrame(all_stats)
|
1962 |
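# Hedged usage sketch (placeholder file name and query; test_params must provide a list
# of values for every key the loop reads, e.g. 'split_strategy', 'chunk_size',
# 'overlap_size', 'top_k', 'lang', 'apply_preprocessing', 'optimize_vocab',
# 'use_query_optimization', 'use_reranking', 'apply_phonetic', 'phonetic_weight',
# 'custom_separators', 'query_optimization_model'):
#
#   model_configs = [{'type': 'HuggingFace', 'name': 'paraphrase-miniLM'}]
#   test_params = {'split_strategy': ['recursive'], 'chunk_size': [250], ...}
#   results_df, stats_df = run_automated_tests("docs.txt", "example query",
#                                              model_configs, test_params)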
|
1963 |
# Helper function to calculate model ranking score
|
|
|
1970 |
'contains_expected': 0.3,
|
1971 |
'expected_result_rank': -0.2 # Negative weight because lower rank is better
|
1972 |
}
|
1973 |
+
|
1974 |
score = 0.0
|
1975 |
for metric, weight in weights.items():
|
1976 |
if metric in stats and not isinstance(stats[metric], str):
|
|
|
1981 |
else:
|
1982 |
value = float(stats[metric])
|
1983 |
score += weight * value
|
1984 |
+
|
1985 |
return score
|
1986 |
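# Worked example with the weights visible above: a run whose results contain the expected
# answer (contains_expected = 1) at rank 2 contributes
#   0.3 * 1 + (-0.2) * 2 = -0.1
# from these two metrics; the remaining weighted metrics are added on top.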
|
1987 |
if __name__ == "__main__":
|
1988 |
launch_interface()
|
|