# Source: analytics_vidhya_search / course-search-system-v2.py
# Uploaded by rxhulshxrmx ("Upload 5 files", commit 2820d25, verified)
# File size: 5.66 kB
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Tuple
import re
class CourseSearchSystem:
    """Semantic search over free Analytics Vidhya courses.

    Course text is embedded with a SentenceTransformer model; a query is
    embedded the same way and courses are ranked by cosine similarity.
    Call ``load_and_prepare_data`` before ``search_courses``.
    """

    # Placeholder that ``prepare_course_data`` writes for missing takeaways;
    # ``generate_response`` uses it to decide whether to render the section.
    _NO_TAKEAWAYS = 'Course details not available.'

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Create the search system.

        Args:
            model_name: Name of the SentenceTransformer model to load.
                Defaults to the model the system was originally built with.
        """
        # Embedding model used for both courses and queries.
        self.model = SentenceTransformer(model_name)
        # Prepared free-course table; set by load_and_prepare_data().
        self.courses_df = None
        # One embedding row per row of courses_df; set by load_and_prepare_data().
        self.course_embeddings = None

    def preprocess_text(self, text: str) -> str:
        """Clean and standardize text data.

        Missing values become the empty string. Otherwise every character
        that is neither a word character nor whitespace is replaced by a
        space, runs of whitespace are collapsed, and the result is lowercased.
        """
        if pd.isna(text):
            return ""
        text = re.sub(r'[^\w\s]', ' ', str(text))
        return ' '.join(text.split()).lower()

    def prepare_course_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the free courses and build their searchable text.

        Args:
            df: Raw course table. The code reads the columns 'Course Name',
                'Key Takeaways' and 'Difficulty', and fills defaults for
                'Course Time', 'Ratings', 'Difficulty' and 'Key Takeaways'.

        Returns:
            A new DataFrame holding only rows whose 'Course Name' contains
            "free" (case-insensitive), with missing values filled and an
            added 'search_text' column.
        """
        is_free = df['Course Name'].str.contains('Free', case=False, na=False)
        free_courses = df[is_free].fillna({
            'Course Time': 0,
            'Ratings': 4.6,
            'Difficulty': 'Beginner',
            'Key Takeaways': self._NO_TAKEAWAYS,
        })
        # Column-wise concatenation instead of a row-wise apply: it yields the
        # same text per row but also works when no course matches (a row-wise
        # apply on an empty frame does not return a Series).
        combined = (
            free_courses['Course Name'].astype(str)
            + ' ' + free_courses['Key Takeaways'].astype(str)
            + ' ' + free_courses['Difficulty'].astype(str)
        )
        free_courses['search_text'] = combined.map(self.preprocess_text)
        return free_courses

    def load_and_prepare_data(self, df: pd.DataFrame):
        """Prepare the course table and compute one embedding per course.

        Stores the prepared table in ``self.courses_df`` and the embeddings
        in ``self.course_embeddings``.
        """
        self.courses_df = self.prepare_course_data(df)
        texts = self.courses_df['search_text'].tolist()
        if not texts:
            # Nothing to embed; keep a well-formed empty matrix so that
            # search_courses can report "no results" instead of crashing.
            self.course_embeddings = np.empty((0, 0))
            return
        self.course_embeddings = self.model.encode(
            texts,
            show_progress_bar=True
        )

    @staticmethod
    def _cosine_similarities(matrix, vector) -> np.ndarray:
        """Return the cosine similarity of each row of *matrix* with *vector*."""
        matrix = np.asarray(matrix, dtype=float)
        vector = np.asarray(vector, dtype=float)
        denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
        # A zero vector has no direction; score it 0 rather than dividing by 0.
        denominators[denominators == 0] = 1.0
        return (matrix @ vector) / denominators

    def _format_takeaways(self, takeaways_text) -> str:
        """Render the takeaways as a markdown bullet list ('' if unavailable)."""
        if not takeaways_text or takeaways_text == self._NO_TAKEAWAYS:
            return ""
        parts = ["\n**Key Takeaways:**\n"]
        # Individual takeaways are separated by '.,' in the source text.
        for takeaway in takeaways_text.split('.,'):
            cleaned_takeaway = takeaway.strip('. ,')
            if cleaned_takeaway:
                parts.append(f"- {cleaned_takeaway}\n")
        return "".join(parts)

    def _format_course(self, position: int, result: Dict) -> str:
        """Render one search result as a markdown section."""
        parts = [f"\n### {position}. {result['course_name']}\n"]

        rating = result['ratings']
        if not pd.isna(rating):
            # One star per whole point, plus a half marker for >= .5 remainder.
            stars = "⭐" * int(rating) + ("½" if rating % 1 >= 0.5 else "")
            parts.append(f"**Rating:** {stars} ({rating})\n")

        parts.append(f"**Difficulty:** {result['difficulty']}\n")
        if result['course_time']:
            parts.append(f"**Duration:** {result['course_time']} hours\n")

        parts.append(self._format_takeaways(result['key_takeaways']))

        url = result['url']
        # Skip the link when the URL is missing, so "(nan)" is never rendered.
        if not pd.isna(url) and str(url).strip():
            parts.append(f"\n🔗 [Access the course here]({url})\n")
        return "".join(parts)

    def generate_response(self, query: str, results: List[Dict]) -> str:
        """Generate a natural language response with course recommendations.

        Args:
            query: The query text to echo back to the user.
            results: Result dicts with the keys 'course_name', 'ratings',
                'difficulty', 'course_time', 'key_takeaways' and 'url'.

        Returns:
            A markdown string. When *results* is empty the text says that
            nothing was found instead of announcing matches.
        """
        if not results:
            return (
                f"I've searched through Analytics Vidhya's free courses but couldn't find "
                f"any matches for '{query}'. Would you like to try a different topic?"
            )

        response_parts = [
            f"I've searched through Analytics Vidhya's free courses related to '{query}' and found some excellent matches. Here are the most relevant courses:"
        ]
        response_parts.extend(
            self._format_course(position, result)
            for position, result in enumerate(results, 1)
        )
        response_parts.append("\nEach of these courses is free and available on the Analytics Vidhya platform. Would you like me to provide more specific details about any of these courses or help you find courses on a different topic?")
        return "\n".join(response_parts)

    def search_courses(self, query: str, top_k: int = 5) -> str:
        """Search for courses and return a formatted markdown response.

        Args:
            query: Free-text search query.
            top_k: Maximum number of courses to return. Values <= 0 yield no
                results; values larger than the catalogue are clamped.

        Returns:
            The text produced by ``generate_response``.

        Raises:
            RuntimeError: If ``load_and_prepare_data`` has not been called.
        """
        if self.courses_df is None or self.course_embeddings is None:
            raise RuntimeError(
                "Course data has not been loaded; call load_and_prepare_data() first."
            )

        # Echo what the user typed, not the lowercased/stripped search form.
        display_query = "" if pd.isna(query) else str(query).strip()

        # Clamp explicitly: a slice of [-0:] would select every course.
        limit = min(max(int(top_k), 0), len(self.courses_df))
        if limit == 0:
            return self.generate_response(display_query, [])

        query_embedding = self.model.encode([self.preprocess_text(query)])[0]
        similarities = self._cosine_similarities(self.course_embeddings, query_embedding)

        # argsort is ascending; reverse it and keep the best `limit` rows.
        top_indices = np.argsort(similarities)[::-1][:limit]

        results = []
        for idx in top_indices:
            course = self.courses_df.iloc[idx]
            results.append({
                'course_name': course['Course Name'],
                'key_takeaways': course['Key Takeaways'],
                'course_time': course['Course Time'],
                'ratings': course['Ratings'],
                'difficulty': course['Difficulty'],
                'similarity_score': similarities[idx],
                'url': course['Website']
            })
        return self.generate_response(display_query, results)
def test_search_system(df: pd.DataFrame):
    """Run a fixed set of sample queries through a fresh search system.

    Builds a ``CourseSearchSystem``, loads *df* into it, and prints the
    top-3 response for each sample query followed by a separator line.
    """
    sample_queries = (
        "machine learning for beginners",
        "natural language processing",
        "computer vision courses",
        "data preprocessing tutorials",
        "generative AI learning",
    )
    divider = "\n" + "="*80 + "\n"

    engine = CourseSearchSystem()
    engine.load_and_prepare_data(df)

    for sample in sample_queries:
        print(f"\nTesting query: '{sample}'\n")
        print(engine.search_courses(sample, top_k=3))
        print(divider)
if __name__ == "__main__":
    # Script entry point: read the course catalogue from the working
    # directory and run the sample queries against it.
    test_search_system(pd.read_csv('course_data.csv'))