Spaces:
Sleeping
Sleeping
import re
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
class CourseSearchSystem:
    """Embedding-based search over free Analytics Vidhya courses.

    Course text is embedded once with a SentenceTransformer model. A query
    is embedded the same way, courses are ranked by dot product against the
    query embedding, and the best matches are rendered as Markdown.
    """

    # Placeholder written into missing 'Key Takeaways' cells; the response
    # formatter recognises it and omits the takeaways section.
    _NO_DETAILS = 'Course details not available.'

    def __init__(self):
        """Load the embedding model; course data is attached later."""
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Both stay None until load_and_prepare_data() has been called.
        self.courses_df: Optional[pd.DataFrame] = None
        self.course_embeddings = None

    def preprocess_text(self, text: str) -> str:
        """Normalise text for embedding.

        Missing values become an empty string. Otherwise every character
        that is neither a word character nor whitespace is replaced by a
        space, runs of whitespace are collapsed, and the result is
        lowercased.
        """
        if pd.isna(text):
            return ""
        text = re.sub(r'[^\w\s]', ' ', str(text))
        return ' '.join(text.split()).lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the free courses and build their searchable text.

        Args:
            df: Raw course table. 'Course Name', 'Key Takeaways' and
                'Difficulty' are read directly; 'Course Time' and 'Ratings'
                are back-filled when missing.

        Returns:
            A new DataFrame holding only rows whose 'Course Name' contains
            "free" (case-insensitive), with missing values filled and an
            added 'search_text' column.
        """
        is_free = df['Course Name'].str.contains('Free', case=False, na=False)
        free_courses = df[is_free].fillna({
            'Course Time': 0,
            'Ratings': 4.6,
            'Difficulty': 'Beginner',
            'Key Takeaways': self._NO_DETAILS,
        })
        # Built column-wise rather than with a row-wise DataFrame.apply so
        # that an empty selection simply yields an empty column.
        free_courses['search_text'] = [
            self.preprocess_text(f"{name} {takeaways} {difficulty}")
            for name, takeaways, difficulty in zip(
                free_courses['Course Name'],
                free_courses['Key Takeaways'],
                free_courses['Difficulty'],
            )
        ]
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame):
        """Prepare the course table and embed every course's search text."""
        self.courses_df = self.prepare_course_data(df)
        self.course_embeddings = self.model.encode(
            self.courses_df['search_text'].tolist(),
            show_progress_bar=True
        )

    @staticmethod
    def _top_indices(similarities, top_k: int) -> List[int]:
        """Return positions of the ``top_k`` highest scores, best first.

        A non-positive ``top_k`` yields an empty list; a ``top_k`` larger
        than the number of scores yields every position.
        """
        if top_k <= 0:
            # Guard needed because slicing with [-0:] would return everything.
            return []
        best_first = np.argsort(similarities)[::-1]
        return best_first[:top_k].tolist()

    @staticmethod
    def _format_course(position: int, result: Dict) -> str:
        """Render one search result as a Markdown section."""
        rating = result['ratings']
        stars = "⭐" * int(rating) + ("½" if rating % 1 >= 0.5 else "")
        pieces = [
            f"\n### {position}. {result['course_name']}\n",
            f"**Rating:** {stars} ({rating})\n",
            f"**Difficulty:** {result['difficulty']}\n",
        ]
        # A duration of 0 is the fill value for "unknown", so it is hidden.
        if result['course_time']:
            pieces.append(f"**Duration:** {result['course_time']} hours\n")

        takeaways = result['key_takeaways']
        if takeaways and takeaways != CourseSearchSystem._NO_DETAILS:
            pieces.append("\n**Key Takeaways:**\n")
            # Takeaways are stored as one string with items separated by '.,'.
            for item in takeaways.split('.,'):
                item = item.strip('. ,')
                if item:
                    pieces.append(f"- {item}\n")

        pieces.append(f"\n🔗 [Access the course here]({result['url']})\n")
        return "".join(pieces)

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Build the Markdown answer for a query.

        Args:
            query: Query text to echo back to the user.
            results: Result dicts with the keys 'course_name', 'ratings',
                'difficulty', 'course_time', 'key_takeaways' and 'url'.

        Returns:
            A Markdown string: an introduction, one section per result and
            a closing line, or a short "nothing found" message when
            ``results`` is empty.
        """
        if not results:
            # Avoid announcing "excellent matches" above an empty list.
            return (
                f"I couldn't find any free Analytics Vidhya courses matching "
                f"'{query}'. Try rephrasing your search or using broader keywords."
            )

        response_parts = [
            f"I've searched through Analytics Vidhya's free courses related to '{query}' and found some excellent matches. Here are the most relevant courses:"
        ]
        response_parts.extend(
            self._format_course(position, result)
            for position, result in enumerate(results, 1)
        )
        response_parts.append("\nEach of these courses is free and available on the Analytics Vidhya platform. Would you like me to provide more specific details about any of these courses or help you find courses on a different topic?")
        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Search the loaded courses and return a formatted response.

        Args:
            query: Free-text search query.
            top_k: Maximum number of courses to include. Values below 1
                produce the "nothing found" response.

        Returns:
            The Markdown response produced by ``generate_response``.

        Raises:
            RuntimeError: If ``load_and_prepare_data`` has not been called.
        """
        if self.courses_df is None or self.course_embeddings is None:
            raise RuntimeError(
                "No course data loaded; call load_and_prepare_data() first."
            )

        # Keep the user's own wording for display; only the normalised form
        # is fed to the embedding model.
        display_query = "" if pd.isna(query) else str(query).strip()
        if top_k <= 0 or len(self.courses_df) == 0:
            return self.generate_response(display_query, [])

        cleaned_query = self.preprocess_text(query)
        query_embedding = self.model.encode([cleaned_query])[0]
        similarities = np.dot(self.course_embeddings, query_embedding)

        results = []
        for idx in self._top_indices(similarities, top_k):
            # Positional lookup: embeddings were built in DataFrame row order.
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': similarities[idx],
                'url': course['Website']
            })
        return self.generate_response(display_query, results)
def test_search_system(
    df: pd.DataFrame,
    queries: Optional[List[str]] = None,
    top_k: int = 3,
):
    """Run sample queries through a fresh search system and print the results.

    Args:
        df: Raw course table passed to
            ``CourseSearchSystem.load_and_prepare_data``.
        queries: Queries to run. When omitted, a built-in set of five
            sample queries is used.
        top_k: Number of courses requested per query.
    """
    if queries is None:
        queries = [
            "machine learning for beginners",
            "natural language processing",
            "computer vision courses",
            "data preprocessing tutorials",
            "generative AI learning",
        ]

    search_system = CourseSearchSystem()
    search_system.load_and_prepare_data(df)

    separator = "\n" + "=" * 80 + "\n"
    for query in queries:
        print(f"\nTesting query: '{query}'\n")
        print(search_system.search_courses(query, top_k=top_k))
        print(separator)
if __name__ == "__main__":
    # Script entry point: read the course table from the working directory
    # and run the sample queries against it.
    test_search_system(pd.read_csv('course_data.csv'))