import os
import pickle

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# word_tokenize relies on the NLTK 'punkt' tokenizer data; fetch it once if missing.
nltk.download('punkt', quiet=True)


class QuestionSimilarityModel:
    def __init__(self, dataset_path, cache_path='embeddings_cache.pkl'):
        self.dataset_path = dataset_path
        self.cache_path = cache_path
        self.dataset = pd.read_csv(dataset_path)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self._load_or_generate_embeddings()
    def _generate_embeddings(self, questions):
        # Embed each question's title together with its difficulty label.
        combined_text = questions.apply(lambda x: f"{x['title']} Difficulty: {x['difficulty']}", axis=1)
        # Return numpy arrays so the embeddings pickle cleanly and work with sklearn.
        return self.model.encode(combined_text.tolist(), convert_to_numpy=True)
    def _load_or_generate_embeddings(self):
        # Reuse cached embeddings when present; otherwise compute and cache them.
        if os.path.exists(self.cache_path):
            print("Loading cached embeddings...")
            with open(self.cache_path, 'rb') as f:
                return pickle.load(f)
        print("Generating new embeddings...")
        embeddings = self._generate_embeddings(self.dataset)
        with open(self.cache_path, 'wb') as f:
            pickle.dump(embeddings, f)
        return embeddings
    def _preprocess(self, text):
        # Lowercase and tokenize, then rejoin into a normalized string.
        tokens = word_tokenize(text.lower())
        return ' '.join(tokens)
    def check_similarity(self, new_questions):
        results = []
        for question in new_questions:
            preprocessed = self._preprocess(question)
            new_embedding = self.model.encode(preprocessed, convert_to_numpy=True)
            # Compare the query against every cached embedding in one pass.
            similarities = cosine_similarity(new_embedding.reshape(1, -1), self.embeddings)[0]
            max_score = np.max(similarities)
            max_index = int(np.argmax(similarities))
            matched_indices = np.where(similarities >= 0.7)[0]  # Threshold for a strong match
            matched_sources = self.dataset.iloc[matched_indices][['title', 'difficulty']].to_dict('records')
            best_match = self.dataset.iloc[max_index]
            results.append({
                'input_question': question,
                'relevance_score': float(max_score),
                'matched_sources': matched_sources,
                'best_match': {
                    'index': max_index,
                    'title': best_match['title'],
                    'difficulty': best_match['difficulty']
                }
            })
        return results
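

# A minimal usage sketch, not part of the class above: it assumes a CSV file named
# 'questions.csv' with 'title' and 'difficulty' columns; the filename and the sample
# questions below are placeholders for illustration only.
if __name__ == '__main__':
    checker = QuestionSimilarityModel('questions.csv')
    new_questions = [
        "Find the longest palindromic substring in a string",
        "Reverse a singly linked list in place",
    ]
    for result in checker.check_similarity(new_questions):
        print(f"{result['input_question']} -> {result['best_match']['title']} "
              f"(score={result['relevance_score']:.2f})")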