# Spaces: Sleeping
import pandas as pd | |
import numpy as np | |
from typing import List, Dict, Tuple | |
import re | |
import torch | |
from transformers import AutoModel, AutoTokenizer | |
class CourseSearchSystem:
    """Semantic search over a catalogue of free courses.

    Course text is embedded with the ``sentence-transformers/all-MiniLM-L6-v2``
    encoder. A query is embedded the same way and courses are ranked by the
    dot product of the L2-normalised vectors.

    Typical use: construct, call :meth:`load_and_prepare_data` once with the
    course table, then call :meth:`search_courses` per query.
    """

    # Placeholder written into 'Key Takeaways' when a course has none; it is
    # recognised again in the response formatter so it is never shown as a bullet.
    _NO_DETAILS = 'Course details not available.'
    # Anything that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self):
        """Load the tokenizer and encoder, using CUDA when it is available."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = 'sentence-transformers/all-MiniLM-L6-v2'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.model.eval()

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings into one vector per input, ignoring padding.

        Args:
            model_output: Encoder output; element 0 is read as the per-token
                embeddings.
            attention_mask: Mask with 1 for real tokens and 0 for padding, one
                row per input.

        Returns:
            Tensor with one pooled embedding per input row.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so an all-padding row cannot cause a division by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Return L2-normalised sentence embeddings for ``texts``.

        Args:
            texts: Strings to embed.
            batch_size: Number of texts encoded per forward pass. Encoding in
                batches keeps peak memory bounded for large catalogues.

        Returns:
            Array with one row per input text. An empty input yields an array
            with zero rows.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        texts = list(texts)
        if not texts:
            # The tokenizer cannot encode an empty batch; return an empty
            # matrix whose width matches the encoder's embedding size.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                encoded_input = self.tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    max_length=512,
                )
                encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                model_output = self.model(**encoded_input)
                pooled = self.mean_pooling(model_output, encoded_input['attention_mask'])
                pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
                chunks.append(pooled.cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def preprocess_text(self, text: str) -> str:
        """Normalise text: drop punctuation, collapse whitespace, lower-case.

        Missing values (``None``/NaN) become the empty string.
        """
        if pd.isna(text):
            return ""
        text = self._PUNCTUATION.sub(' ', str(text))
        return ' '.join(text.split()).lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select free courses, fill missing fields and build the search text.

        Args:
            df: Course table. 'Course Name', 'Key Takeaways' and 'Difficulty'
                are read; 'Course Time' and 'Ratings' are filled when present.

        Returns:
            A new DataFrame holding only the rows whose 'Course Name' contains
            "free" (case-insensitive), with an added 'search_text' column.
        """
        is_free = df['Course Name'].str.contains('Free', case=False, na=False)
        free_courses = df[is_free].fillna({
            'Course Time': 0,
            'Ratings': 4.6,
            'Difficulty': 'Beginner',
            'Key Takeaways': self._NO_DETAILS,
        })
        # Built with zip rather than DataFrame.apply(axis=1): apply returns a
        # DataFrame (not a Series) for an empty frame, which breaks the column
        # assignment when no course matches.
        free_courses['search_text'] = [
            self.preprocess_text(f"{name} {takeaways} {level}")
            for name, takeaways, level in zip(
                free_courses['Course Name'],
                free_courses['Key Takeaways'],
                free_courses['Difficulty'],
            )
        ]
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame):
        """Prepare the course table and cache one embedding per course.

        Sets ``self.courses_df`` and ``self.course_embeddings``.
        """
        self.courses_df = self.prepare_course_data(df)
        self.course_embeddings = self.get_embeddings(self.courses_df['search_text'].tolist())

    @staticmethod
    def _top_indices(similarities: np.ndarray, top_k: int) -> np.ndarray:
        """Return indices of the ``top_k`` largest scores, best first."""
        if top_k <= 0:
            # A plain [-top_k:] slice would return *everything* for top_k == 0.
            return np.empty(0, dtype=np.intp)
        return np.argsort(similarities)[::-1][:top_k]

    @staticmethod
    def _format_takeaways(raw: str, limit: int = 3, max_length: int = 100) -> str:
        """Turn a '.,'-separated takeaway string into at most ``limit`` bullets."""
        bullets = []
        for fragment in raw.split('.,'):
            cleaned = fragment.strip('. ,')
            if not cleaned:
                continue
            if len(cleaned) > max_length:
                cleaned = cleaned[:max_length - 3] + "..."
            bullets.append(f"• {cleaned}")
        shown = bullets[:limit]
        # Count real bullets, not raw fragments, so empty fragments do not
        # produce a misleading "And more...".
        if len(bullets) > limit:
            shown.append("• And more...")
        return "\n".join(shown)

    def _format_course(self, position: int, result: Dict) -> str:
        """Render one search result as a markdown section."""
        section = f"\n**{position}. {result['course_name']}**\n"
        section += f"**Rating:** {result['ratings']}/5.0\n"
        section += f"**Level:** {result['difficulty']}\n"

        # A duration of 0 marks "unknown" and is omitted.
        if result['course_time']:
            section += f"**Duration:** {result['course_time']} hours\n"

        takeaways = result['key_takeaways']
        if isinstance(takeaways, str) and takeaways != self._NO_DETAILS:
            bullets = self._format_takeaways(takeaways)
            if bullets:
                section += "\n**What you'll learn:**\n" + bullets

        similarity_percentage = int(result['similarity_score'] * 100)
        section += f"\n**Match Score:** {similarity_percentage}%"

        # Skip the link when the URL is missing (e.g. NaN from the table)
        # instead of rendering a dead "[Start Course](nan)".
        url = result['url']
        if isinstance(url, str) and url.strip():
            section += f"\n\n[Start Course]({url})\n"
        else:
            section += "\n"
        return section

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Format search results as a markdown answer.

        Args:
            query: The search text echoed back to the user.
            results: One dict per course with the keys 'course_name',
                'key_takeaways', 'course_time', 'ratings', 'difficulty',
                'similarity_score' and 'url'.

        Returns:
            Markdown text: an introduction, one section per course and a notes
            footer. With no results, a short "nothing found" message instead.
        """
        if not results:
            return (
                f"I couldn't find any free courses matching your search for '{query}'. "
                "Try different or broader keywords."
            )

        if len(results) == 1:
            intro = f"I found an excellent free course matching your search for '{query}':"
        else:
            intro = f"I found {len(results)} relevant free courses matching your search for '{query}':"

        response_parts = [intro]
        response_parts.extend(
            self._format_course(position, result)
            for position, result in enumerate(results, 1)
        )
        response_parts.extend([
            "\n---\n",
            "**Notes:**",
            "• Courses are sorted by relevance to your search",
            "• All courses are free and include hands-on projects",
            "• Certificates are provided upon completion",
        ])
        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Search the prepared courses and return a formatted markdown answer.

        Requires :meth:`load_and_prepare_data` to have been called first.

        Args:
            query: Free-text search string.
            top_k: Maximum number of courses to return; values <= 0 return none.
        """
        query = self.preprocess_text(query)
        query_embedding = self.get_embeddings([query])[0]
        # Embeddings are L2-normalised, so the dot product is the similarity.
        similarities = np.dot(self.course_embeddings, query_embedding)

        results = []
        for idx in self._top_indices(similarities, top_k):
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': similarities[idx],
                'url': course['Website'],
            })
        return self.generate_response(query, results)