import json
import logging
from dataclasses import dataclass, asdict
from typing import List, Optional

import requests
from bs4 import BeautifulSoup


@dataclass
class Course:
    """A single course listing scraped from the free-courses page."""
    title: str
    url: str
    categories: List[str]
    rating_count: int
    lesson_count: int
    price: str
    image_url: str


class AnalyticsVidhyaScraper:
    """Scrapes the Analytics Vidhya free-courses page into Course records."""

    def __init__(self):
        self.base_url = "https://courses.analyticsvidhya.com"
        self.free_courses_url = f"{self.base_url}/pages/all-free-courses"
        # Browser-like headers; some sites serve different markup to bare clients.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.setup_logging()

    def setup_logging(self):
        """Log to both scraper.log and the console."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraper.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_page_content(self) -> Optional[BeautifulSoup]:
        """Fetch the free-courses page and return it parsed, or None on error."""
        try:
            response = requests.get(self.free_courses_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            self.logger.debug(f"First 500 characters of HTML: {response.text[:500]}")
            return soup
        except Exception as e:
            self.logger.error(f"Error fetching page: {e}")
            return None

    def extract_course_info(self, course_card) -> Optional[Course]:
        """Parse one <a class="course-card"> element into a Course, or None."""
        try:
            # Relative hrefs are resolved against the site root.
            url = course_card.get('href', '')
            if url and not url.startswith('http'):
                url = self.base_url + url

            # The <h4> holds a comma-separated category list.
            h4_tag = course_card.find('h4')
            categories = [cat.strip() for cat in h4_tag.text.split(',') if cat.strip()] if h4_tag else []

            title = course_card.find('h3').text.strip() if course_card.find('h3') else ''

            # The review count appears as "(123)", so strip the parentheses.
            rating_span = course_card.find('span', class_='review__stars-count')
            rating_count = int(rating_span.text.strip('()')) if rating_span else 0

            # The lesson count appears as e.g. "<strong>12 lessons</strong>".
            lesson_count_span = course_card.find('span', class_='course-card__lesson-count')
            lesson_count = int(lesson_count_span.find('strong').text.split()[0]) if lesson_count_span else 0

            price_span = course_card.find('span', class_='course-card__price')
            price = price_span.find('strong').text.strip() if price_span else ''

            img_tag = course_card.find('img', class_='course-card__img')
            image_url = img_tag.get('src', '') if img_tag else ''

            return Course(
                title=title,
                url=url,
                categories=categories,
                rating_count=rating_count,
                lesson_count=lesson_count,
                price=price,
                image_url=image_url
            )
        except Exception as e:
            self.logger.error(f"Error extracting course info: {e}")
            return None

    def scrape_courses(self) -> List[Course]:
        """Walk every article section on the page and collect its course cards."""
        soup = self.get_page_content()
        if not soup:
            return []

        courses = []
        sections = soup.find_all('article', class_='section__content')
        self.logger.info(f"Found {len(sections)} article sections")

        for section in sections:
            course_cards = section.find_all('a', class_='course-card')
            self.logger.info(f"Found {len(course_cards)} course cards in section")
            for card in course_cards:
                course = self.extract_course_info(card)
                if course:
                    courses.append(course)
                    self.logger.info(f"Extracted course: {course.title}")

        return courses

    def save_courses(self, courses: List[Course], filename: str = 'courses.json'):
        """Serialize the scraped courses to a JSON file."""
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump([asdict(course) for course in courses], f, indent=2, ensure_ascii=False)
            self.logger.info(f"Successfully saved {len(courses)} courses to {filename}")
        except Exception as e:
            self.logger.error(f"Error saving courses: {e}")


def main():
    scraper = AnalyticsVidhyaScraper()
    scraper.logger.info("Starting course scraping")
    courses = scraper.scrape_courses()

    if courses:
        scraper.save_courses(courses)
        print(f"Successfully scraped {len(courses)} courses")
        print("\nSample of scraped courses:")
        for course in courses[:3]:  # Show the first 3 courses
            print(f"\nTitle: {course.title}")
            print(f"Categories: {', '.join(course.categories)}")
            print(f"Lessons: {course.lesson_count}")
            print(f"Rating Count: {course.rating_count}")
            print(f"Price: {course.price}")
            print(f"URL: {course.url}")
    else:
        print("No courses were found. Check the logs for details.")


if __name__ == "__main__":
    main()
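

# ---------------------------------------------------------------------------
# Optional usage sketch: a minimal, self-contained check of
# extract_course_info against a hand-written course card. The markup below is
# an assumption: it only mirrors the selectors the scraper queries
# (a.course-card, h3, h4, span.review__stars-count, and so on) and is not a
# copy of the live Analytics Vidhya page. Call _demo_extract_course_info()
# manually, e.g. from a REPL, to try it without hitting the network.
# ---------------------------------------------------------------------------
def _demo_extract_course_info():
    sample_html = """
    <a class="course-card" href="/courses/intro-to-python">
      <img class="course-card__img" src="https://example.com/course.png">
      <h4>Python, Data Science</h4>
      <h3>Introduction to Python</h3>
      <span class="review__stars-count">(128)</span>
      <span class="course-card__lesson-count"><strong>12 lessons</strong></span>
      <span class="course-card__price"><strong>Free</strong></span>
    </a>
    """
    card = BeautifulSoup(sample_html, 'html.parser').find('a', class_='course-card')
    course = AnalyticsVidhyaScraper().extract_course_info(card)
    # Expected: the relative href is resolved against base_url, rating_count
    # is 128, lesson_count is 12, and price is 'Free'.
    print(asdict(course))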