|
import requests |
|
from bs4 import BeautifulSoup |
|
import json |
|
import logging |
|
from dataclasses import dataclass, asdict |
|
from typing import List, Optional |
|
|
|
@dataclass
class Course:
    """A single course listing scraped from the Analytics Vidhya catalog page."""
    title: str            # course title text (from the card's <h3>)
    url: str              # absolute link to the course page
    categories: List[str] # category labels (comma-separated in the card's <h4>)
    rating_count: int     # number of reviews shown on the card (0 if absent)
    lesson_count: int     # number of lessons shown on the card (0 if absent)
    price: str            # displayed price text, e.g. "Free" ('' if absent)
    image_url: str        # course thumbnail URL ('' if absent)
|
|
|
class AnalyticsVidhyaScraper:
    """Scraper for the free-courses listing on courses.analyticsvidhya.com.

    Typical usage:
        scraper = AnalyticsVidhyaScraper()
        courses = scraper.scrape_courses()
        scraper.save_courses(courses)
    """

    def __init__(self):
        # Base site URL; relative course links are resolved against it.
        self.base_url = "https://courses.analyticsvidhya.com"
        self.free_courses_url = f"{self.base_url}/pages/all-free-courses"
        # Browser-like headers so the request is less likely to be rejected as a bot.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.setup_logging()

    def setup_logging(self):
        """Configure logging to both 'scraper.log' and the console, and bind self.logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraper.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_page_content(self) -> Optional["BeautifulSoup"]:
        """Fetch the free-courses page and return it parsed.

        Returns:
            A BeautifulSoup document on success, or None if the request or
            parsing fails (the error is logged).
        """
        try:
            response = requests.get(self.free_courses_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Log a snippet of the raw HTML to help debug selector changes.
            self.logger.debug(f"First 500 characters of HTML: {response.text[:500]}")

            return soup
        except Exception as e:
            # Network/HTTP/parsing failures are reported as None so callers
            # can degrade gracefully instead of crashing.
            self.logger.error(f"Error fetching page: {str(e)}")
            return None

    def extract_course_info(self, course_card) -> Optional["Course"]:
        """Build a Course from one course-card anchor tag.

        Args:
            course_card: a bs4 Tag for an <a class="course-card"> element.

        Returns:
            A populated Course, or None if any extraction step raises
            (the error is logged and the card is skipped).
        """
        try:
            # Resolve relative hrefs against the site root.
            url = course_card.get('href', '')
            if url and not url.startswith('http'):
                url = self.base_url + url

            # <h4> holds a comma-separated category list.
            h4_tag = course_card.find('h4')
            categories = [cat.strip() for cat in h4_tag.text.split(',') if cat.strip()] if h4_tag else []

            # <h3> holds the title; look it up once instead of twice.
            h3_tag = course_card.find('h3')
            title = h3_tag.text.strip() if h3_tag else ''

            # Review count appears as "(123)" — strip the parentheses.
            rating_span = course_card.find('span', class_='review__stars-count')
            rating_count = int(rating_span.text.strip('()')) if rating_span else 0

            # Lesson count appears as e.g. "<strong>12 Lessons</strong>".
            lesson_count_span = course_card.find('span', class_='course-card__lesson-count')
            lesson_count = int(lesson_count_span.find('strong').text.split()[0]) if lesson_count_span else 0

            price_span = course_card.find('span', class_='course-card__price')
            price = price_span.find('strong').text.strip() if price_span else ''

            img_tag = course_card.find('img', class_='course-card__img')
            image_url = img_tag.get('src', '') if img_tag else ''

            return Course(
                title=title,
                url=url,
                categories=categories,
                rating_count=rating_count,
                lesson_count=lesson_count,
                price=price,
                image_url=image_url
            )
        except Exception as e:
            self.logger.error(f"Error extracting course info: {str(e)}")
            return None

    def scrape_courses(self) -> List["Course"]:
        """Fetch the listing page and return every course that parses cleanly.

        Returns:
            A list of Course objects; empty if the page could not be fetched.
        """
        soup = self.get_page_content()
        if not soup:
            return []

        courses = []
        # Courses are grouped into <article class="section__content"> sections,
        # each containing <a class="course-card"> entries.
        sections = soup.find_all('article', class_='section__content')
        self.logger.info(f"Found {len(sections)} article sections")

        for section in sections:
            course_cards = section.find_all('a', class_='course-card')
            self.logger.info(f"Found {len(course_cards)} course cards in section")

            for card in course_cards:
                course = self.extract_course_info(card)
                if course:
                    courses.append(course)
                    self.logger.info(f"Extracted course: {course.title}")

        return courses

    def save_courses(self, courses: List["Course"], filename: str = 'courses.json'):
        """Serialize courses to a UTF-8 JSON file.

        Args:
            courses: the Course objects to persist.
            filename: output path (default 'courses.json').

        Errors are logged rather than raised.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump([asdict(course) for course in courses], f, indent=2, ensure_ascii=False)
            # BUG FIX: the message previously hard-coded "(unknown)" instead of
            # interpolating the target filename.
            self.logger.info(f"Successfully saved {len(courses)} courses to {filename}")
        except Exception as e:
            self.logger.error(f"Error saving courses: {str(e)}")
|
|
|
def main():
    """Entry point: scrape the free courses, persist them, and print a short summary."""
    scraper = AnalyticsVidhyaScraper()
    scraper.logger.info("Starting course scraping")

    courses = scraper.scrape_courses()
    if not courses:
        # Nothing scraped — point the user at the log file and stop.
        print("No courses were found. Check the logs for details.")
        return

    scraper.save_courses(courses)
    print(f"Successfully scraped {len(courses)} courses")

    # Show a preview of the first three courses on stdout.
    print("\nSample of scraped courses:")
    for course in courses[:3]:
        summary = (
            f"\nTitle: {course.title}",
            f"Categories: {', '.join(course.categories)}",
            f"Lessons: {course.lesson_count}",
            f"Rating Count: {course.rating_count}",
            f"Price: {course.price}",
            f"URL: {course.url}",
        )
        print("\n".join(summary))
|
|
|
# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()