|
import requests |
|
from bs4 import BeautifulSoup |
|
import json |
|
import logging |
|
from dataclasses import dataclass, asdict |
|
from typing import List, Optional |
|
|
|
@dataclass
class Course:
    """A single course listing scraped from the Analytics Vidhya catalog page."""
    title: str            # course title text (from the card's <h3>)
    url: str              # absolute link to the course page
    categories: List[str] # category labels (comma-separated in the card's <h4>)
    rating_count: int     # number of reviews shown on the card (0 if absent)
    lesson_count: int     # number of lessons shown on the card (0 if absent)
    price: str            # displayed price text, e.g. "Free" ('' if absent)
    image_url: str        # course thumbnail URL ('' if absent)
|
|
|
class AnalyticsVidhyaScraper:
    """Scraper for the free-courses listing on courses.analyticsvidhya.com.

    Typical usage:
        scraper = AnalyticsVidhyaScraper()
        courses = scraper.scrape_courses()
        scraper.save_courses(courses)
    """

    def __init__(self):
        # Base site URL; relative course links are resolved against it.
        self.base_url = "https://courses.analyticsvidhya.com"
        self.free_courses_url = f"{self.base_url}/pages/all-free-courses"
        # Browser-like headers so the request is less likely to be rejected as a bot.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        self.setup_logging()

    def setup_logging(self):
        """Configure logging to both 'scraper.log' and the console, and bind self.logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraper.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def get_page_content(self) -> Optional["BeautifulSoup"]:
        """Fetch the free-courses page and return it parsed.

        Returns:
            A BeautifulSoup document on success, or None if the request or
            parsing fails (the error is logged).
        """
        try:
            response = requests.get(self.free_courses_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Log a snippet of the raw HTML to help debug selector changes.
            self.logger.debug(f"First 500 characters of HTML: {response.text[:500]}")

            return soup
        except Exception as e:
            # Network/HTTP/parsing failures are reported as None so callers
            # can degrade gracefully instead of crashing.
            self.logger.error(f"Error fetching page: {str(e)}")
            return None

    def extract_course_info(self, course_card) -> Optional["Course"]:
        """Build a Course from one course-card anchor tag.

        Args:
            course_card: a bs4 Tag for an <a class="course-card"> element.

        Returns:
            A populated Course, or None if any extraction step raises
            (the error is logged and the card is skipped).
        """
        try:
            # Resolve relative hrefs against the site root.
            url = course_card.get('href', '')
            if url and not url.startswith('http'):
                url = self.base_url + url

            # <h4> holds a comma-separated category list.
            h4_tag = course_card.find('h4')
            categories = [cat.strip() for cat in h4_tag.text.split(',') if cat.strip()] if h4_tag else []

            # <h3> holds the title; look it up once instead of twice.
            h3_tag = course_card.find('h3')
            title = h3_tag.text.strip() if h3_tag else ''

            # Review count appears as "(123)" — strip the parentheses.
            rating_span = course_card.find('span', class_='review__stars-count')
            rating_count = int(rating_span.text.strip('()')) if rating_span else 0

            # Lesson count appears as e.g. "<strong>12 Lessons</strong>".
            lesson_count_span = course_card.find('span', class_='course-card__lesson-count')
            lesson_count = int(lesson_count_span.find('strong').text.split()[0]) if lesson_count_span else 0

            price_span = course_card.find('span', class_='course-card__price')
            price = price_span.find('strong').text.strip() if price_span else ''

            img_tag = course_card.find('img', class_='course-card__img')
            image_url = img_tag.get('src', '') if img_tag else ''

            return Course(
                title=title,
                url=url,
                categories=categories,
                rating_count=rating_count,
                lesson_count=lesson_count,
                price=price,
                image_url=image_url
            )
        except Exception as e:
            self.logger.error(f"Error extracting course info: {str(e)}")
            return None

    def scrape_courses(self) -> List["Course"]:
        """Fetch the listing page and return every course that parses cleanly.

        Returns:
            A list of Course objects; empty if the page could not be fetched.
        """
        soup = self.get_page_content()
        if not soup:
            return []

        courses = []
        # Courses are grouped into <article class="section__content"> sections,
        # each containing <a class="course-card"> entries.
        sections = soup.find_all('article', class_='section__content')
        self.logger.info(f"Found {len(sections)} article sections")

        for section in sections:
            course_cards = section.find_all('a', class_='course-card')
            self.logger.info(f"Found {len(course_cards)} course cards in section")

            for card in course_cards:
                course = self.extract_course_info(card)
                if course:
                    courses.append(course)
                    self.logger.info(f"Extracted course: {course.title}")

        return courses

    def save_courses(self, courses: List["Course"], filename: str = 'courses.json'):
        """Serialize courses to a UTF-8 JSON file.

        Args:
            courses: the Course objects to persist.
            filename: output path (default 'courses.json').

        Errors are logged rather than raised.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump([asdict(course) for course in courses], f, indent=2, ensure_ascii=False)
            # BUG FIX: the message previously hard-coded "(unknown)" instead of
            # interpolating the target filename.
            self.logger.info(f"Successfully saved {len(courses)} courses to {filename}")
        except Exception as e:
            self.logger.error(f"Error saving courses: {str(e)}")
|
|
|
def main():
    """Entry point: scrape the free courses, persist them, and print a short summary."""
    scraper = AnalyticsVidhyaScraper()
    scraper.logger.info("Starting course scraping")

    courses = scraper.scrape_courses()
    if not courses:
        # Nothing scraped — point the user at the log file and stop.
        print("No courses were found. Check the logs for details.")
        return

    scraper.save_courses(courses)
    print(f"Successfully scraped {len(courses)} courses")

    # Show a preview of the first three courses on stdout.
    print("\nSample of scraped courses:")
    for course in courses[:3]:
        summary = (
            f"\nTitle: {course.title}",
            f"Categories: {', '.join(course.categories)}",
            f"Lessons: {course.lesson_count}",
            f"Rating Count: {course.rating_count}",
            f"Price: {course.price}",
            f"URL: {course.url}",
        )
        print("\n".join(summary))
|
|
|
# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()