analytics-vidhya-course-search / course_scraper.py
nxyan's picture
Update course_scraper.py
47cb7b2 verified
import json
import logging
import re
from dataclasses import dataclass, asdict
from typing import List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
@dataclass
class Course:
    """One course card taken from the listing page.

    Instances are plain records: the scraper fills them in and
    ``dataclasses.asdict`` turns them into JSON-ready dictionaries.
    """

    # Identity of the course.
    title: str
    url: str

    # Labels attached to the card (may be empty).
    categories: List[str]

    # Numeric details shown on the card.
    rating_count: int
    lesson_count: int

    # Presentation details.
    price: str
    image_url: str
class AnalyticsVidhyaScraper:
    """Scrape the free-course listing on courses.analyticsvidhya.com.

    The scraper downloads one listing page, converts every
    ``a.course-card`` element found inside an ``article.section__content``
    element into a ``Course`` and can write the result to a JSON file.
    Fetch, per-card parse and file-write failures are logged and reported
    through return values instead of being raised.
    """

    def __init__(self):
        """Set the target URLs and request headers, then configure logging."""
        self.base_url = "https://courses.analyticsvidhya.com"
        self.free_courses_url = f"{self.base_url}/pages/all-free-courses"
        # Headers sent with the page request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (file + console) once and set ``self.logger``."""
        # basicConfig() ignores its arguments once the root logger already has
        # handlers, so the handlers are only created when they will be used.
        # Creating them unconditionally would open 'scraper.log' again for
        # every new scraper instance and never close that extra handle.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    # Explicit encoding so non-ASCII course titles can be logged
                    # regardless of the platform's default encoding.
                    logging.FileHandler('scraper.log', encoding='utf-8'),
                    logging.StreamHandler()
                ]
            )
        self.logger = logging.getLogger(__name__)

    def get_page_content(self) -> Optional[BeautifulSoup]:
        """Download the free-courses page and parse it with ``html.parser``.

        Returns:
            The parsed document, or ``None`` when the request (including a
            non-success HTTP status) or the parsing failed; the error is logged.
        """
        try:
            response = requests.get(self.free_courses_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            self.logger.debug("First 500 characters of HTML: %s", response.text[:500])
            return soup
        except Exception as e:
            # Deliberately broad: callers rely on None rather than an exception.
            self.logger.error("Error fetching page: %s", e)
            return None

    def _absolute_url(self, link: str) -> str:
        """Resolve *link* against ``base_url``; an empty link stays empty."""
        # urljoin leaves absolute URLs untouched and also copes with
        # slash-less and protocol-relative links, which plain string
        # concatenation would turn into invalid URLs.
        return urljoin(self.base_url, link) if link else ''

    @staticmethod
    def _first_int(text: str) -> int:
        """Return the first integer in *text* (commas ignored), or 0 if none."""
        match = re.search(r'\d[\d,]*', text)
        return int(match.group().replace(',', '')) if match else 0

    @staticmethod
    def _strong_text(container) -> str:
        """Return the text of the first ``<strong>`` inside *container*.

        Falls back to the container's own text when it has no ``<strong>``
        child, and to ``''`` when *container* is ``None``.
        """
        if container is None:
            return ''
        strong = container.find('strong')
        return (strong if strong is not None else container).text

    def extract_course_info(self, course_card) -> Optional[Course]:
        """Build a ``Course`` from one ``a.course-card`` element.

        Missing pieces fall back to defaults (``''``, ``[]`` or ``0``) so that
        one absent or oddly formatted field does not discard the whole card.

        Args:
            course_card: The anchor element of a course card.

        Returns:
            The extracted ``Course``, or ``None`` if extraction raised
            (the error is logged).
        """
        try:
            url = self._absolute_url(course_card.get('href', ''))

            # The <h4> holds a comma-separated list of categories.
            h4_tag = course_card.find('h4')
            categories = (
                [cat.strip() for cat in h4_tag.text.split(',') if cat.strip()]
                if h4_tag is not None else []
            )

            h3_tag = course_card.find('h3')
            title = h3_tag.text.strip() if h3_tag is not None else ''

            rating_span = course_card.find('span', class_='review__stars-count')
            rating_count = self._first_int(rating_span.text) if rating_span is not None else 0

            lesson_count_span = course_card.find('span', class_='course-card__lesson-count')
            lesson_count = self._first_int(self._strong_text(lesson_count_span))

            price_span = course_card.find('span', class_='course-card__price')
            price = self._strong_text(price_span).strip()

            img_tag = course_card.find('img', class_='course-card__img')
            image_url = self._absolute_url(img_tag.get('src', '')) if img_tag is not None else ''

            return Course(
                title=title,
                url=url,
                categories=categories,
                rating_count=rating_count,
                lesson_count=lesson_count,
                price=price,
                image_url=image_url
            )
        except Exception as e:
            # Per-card isolation: one malformed card must not abort the run.
            self.logger.error("Error extracting course info: %s", e)
            return None

    def scrape_courses(self) -> List[Course]:
        """Fetch the listing page and extract every course card on it.

        Returns:
            The successfully extracted courses in page order; an empty list
            when the page could not be fetched.
        """
        soup = self.get_page_content()
        if soup is None:
            return []

        courses = []
        sections = soup.find_all('article', class_='section__content')
        self.logger.info("Found %d article sections", len(sections))
        for section in sections:
            course_cards = section.find_all('a', class_='course-card')
            self.logger.info("Found %d course cards in section", len(course_cards))
            for card in course_cards:
                course = self.extract_course_info(card)
                if course:
                    courses.append(course)
                    self.logger.info("Extracted course: %s", course.title)
        return courses

    def save_courses(self, courses: List[Course], filename: str = 'courses.json') -> bool:
        """Write *courses* to *filename* as an indented UTF-8 JSON array.

        Args:
            courses: The courses to serialise.
            filename: Destination path; an existing file is overwritten.

        Returns:
            ``True`` when the file was written, ``False`` when writing or
            serialising failed (the error is logged).
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump([asdict(course) for course in courses], f, indent=2, ensure_ascii=False)
        except (OSError, TypeError, ValueError) as e:
            self.logger.error("Error saving courses: %s", e)
            return False
        self.logger.info("Successfully saved %d courses to %s", len(courses), filename)
        return True
def main():
    """Run one scrape, save the result and print a short summary.

    When nothing was scraped only a hint to check the logs is printed and
    nothing is saved.
    """
    scraper = AnalyticsVidhyaScraper()
    scraper.logger.info("Starting course scraping")

    scraped = scraper.scrape_courses()
    if not scraped:
        print("No courses were found. Check the logs for details.")
        return

    scraper.save_courses(scraped)
    print(f"Successfully scraped {len(scraped)} courses")
    print("\nSample of scraped courses:")

    # Preview at most the first three results.
    preview = scraped[:3]
    for item in preview:
        joined_categories = ', '.join(item.categories)
        print(f"\nTitle: {item.title}")
        print(f"Categories: {joined_categories}")
        print(f"Lessons: {item.lesson_count}")
        print(f"Rating Count: {item.rating_count}")
        print(f"Price: {item.price}")
        print(f"URL: {item.url}")


# Only run the scraper when executed as a script, not when imported.
if __name__ == "__main__":
    main()