# RAGbasedcoursesearch / scraper.py
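"""Scraper for the RAG-based course search.

Collects the free course listings (title, duration, lessons, rating, price)
from Analytics Vidhya and writes them to courses_data.csv.
"""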
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_courses():
    # Page that lists all free courses on Analytics Vidhya
    url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    courses = []
    for course_card in soup.find_all("a", class_="card-link"):
        # Extract the course title; skip cards that do not have one
        title_tag = course_card.find("h2", class_="card-text")
        if title_tag is None:
            continue
        title = title_tag.get_text(strip=True)

        # Extract duration and lesson count, e.g. "2 Hours 10 Lessons"
        duration_lessons_tag = course_card.find("p", class_="fs-14")
        duration_lessons = duration_lessons_tag.get_text(strip=True) if duration_lessons_tag else ""
        if " Hours" in duration_lessons:
            duration = duration_lessons.split(" Hours")[0] + " Hours"
            lessons = duration_lessons.split(" Hours")[1].strip()
        else:
            duration, lessons = "Unknown", duration_lessons or "Unknown"

        # Extract review rating (if available); assuming it sits in a span with class "rating"
        rating = course_card.find("span", class_="rating")
        rating = rating.get_text(strip=True) if rating else "No rating"

        # Extract price (if available); courses without a price tag are treated as free
        price = course_card.find("span", class_="price")
        price = price.get_text(strip=True) if price else "Free"

        courses.append({
            'title': title,
            'duration': duration,
            'lessons': lessons,
            'rating': rating,
            'price': price
        })

    return pd.DataFrame(courses)
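
# --- Hypothetical downstream step (not part of the original scraper) ---
# A minimal sketch, assuming the CSV written below is later turned into text
# passages for the RAG course search. The build_documents name, the default
# csv_path, and the passage format are assumptions based on the fields
# scraped above.
def build_documents(csv_path="courses_data.csv"):
    df = pd.read_csv(csv_path)
    return [
        f"{row.title} | {row.duration} | {row.lessons} | "
        f"rating: {row.rating} | price: {row.price}"
        for row in df.itertuples(index=False)
    ]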

# Run the scraper and save the data to a CSV
if __name__ == "__main__":
    courses_df = scrape_courses()
    print(courses_df)  # Preview the scraped data
    courses_df.to_csv("courses_data.csv", index=False)