tronskel's picture
Upload 46 files
2875866 verified
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import word_tokenize
class QuestionSimilarityModel:
def __init__(self, dataset_path, cache_path='embeddings_cache.pkl'):
self.dataset_path = dataset_path
self.cache_path = cache_path
self.dataset = pd.read_csv(dataset_path)
self.model = SentenceTransformer('all-MiniLM-L6-v2')
self.embeddings = self._load_or_generate_embeddings()
def _generate_embeddings(self, questions):
combined_text = questions.apply(lambda x: f"{x['title']} Difficulty: {x['difficulty']}", axis=1)
return self.model.encode(combined_text.tolist(), convert_to_tensor=True)
def _load_or_generate_embeddings(self):
if os.path.exists(self.cache_path):
with open(self.cache_path, 'rb') as f:
print("Loading cached embeddings...")
return pickle.load(f)
else:
print("Generating new embeddings...")
embeddings = self._generate_embeddings(self.dataset)
with open(self.cache_path, 'wb') as f:
pickle.dump(embeddings, f)
return embeddings
def _preprocess(self, text):
tokens = word_tokenize(text.lower())
return ' '.join(tokens)
def check_similarity(self, new_questions):
results = []
for question in new_questions:
preprocessed = self._preprocess(question)
new_embedding = self.model.encode(preprocessed, convert_to_tensor=True)
similarities = cosine_similarity([new_embedding], self.embeddings)[0]
max_score = np.max(similarities)
max_index = np.argmax(similarities)
matched_indices = np.where(similarities >= 0.7)[0] # Threshold for strong match
matched_sources = self.dataset.iloc[matched_indices][['title', 'difficulty']].to_dict('records')
best_match = self.dataset.iloc[max_index]
results.append({
'input_question': question,
'relevance_score': float(max_score),
'matched_sources': matched_sources,
'best_match': {
'index': int(max_index),
'title': best_match['title'],
'difficulty': best_match['difficulty']
}
})
return results