Spaces:
Sleeping
Sleeping
laxminarasimha6
commited on
Commit
•
cdc9adf
1
Parent(s):
28d6d8a
Upload 3 files
Browse files- courses.db +0 -0
- main.py +78 -0
- requirements.txt +7 -0
courses.db
ADDED
Binary file (20.5 kB). View file
|
|
main.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import sqlite3
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
|
7 |
+
# Load environment variables
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
|
11 |
+
# Database setup
def create_database():
    """Create the SQLite database file and the ``courses`` table if absent.

    The table stores a title, a free-form description, and a textual
    price for each scraped course. ``price`` is TEXT (not numeric)
    because the site shows values like "Free" as well as amounts.
    """
    conn = sqlite3.connect('courses.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS courses
                     (id INTEGER PRIMARY KEY, title TEXT, description TEXT, price TEXT)''')
        conn.commit()
    finally:
        # Close even if table creation raises, so the handle never leaks.
        conn.close()
20 |
+
|
21 |
+
|
22 |
+
# Web scraping function to get course data from a specific page
|
23 |
+
def scrape_courses_from_page(page_number):
|
24 |
+
url = f"https://courses.analyticsvidhya.com/collections/courses?page={page_number}"
|
25 |
+
response = requests.get(url)
|
26 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
27 |
+
|
28 |
+
courses = []
|
29 |
+
|
30 |
+
# Find all course items from the products__list
|
31 |
+
course_items = soup.find_all('li', class_='products__list-item')
|
32 |
+
|
33 |
+
for course_item in course_items:
|
34 |
+
# Extract the course title
|
35 |
+
title_tag = course_item.find('h3')
|
36 |
+
title = title_tag.text.strip() if title_tag else 'No title available'
|
37 |
+
|
38 |
+
# Extract course price
|
39 |
+
price_tag = course_item.find('span', class_='course-card__price')
|
40 |
+
price = price_tag.text.strip() if price_tag else "Price not available"
|
41 |
+
|
42 |
+
# Description is not always explicitly provided, using course body for more details
|
43 |
+
description_tag = course_item.find('h4')
|
44 |
+
description = description_tag.text.strip() if description_tag else 'No description available'
|
45 |
+
|
46 |
+
# Append course details (title, description, price)
|
47 |
+
courses.append((title, description, price))
|
48 |
+
|
49 |
+
return courses
|
50 |
+
|
51 |
+
|
# Scrape all pages (8 by default) and return the combined data
def scrape_all_pages(num_pages=8):
    """Scrape every listing page and return the combined course list.

    Parameters
    ----------
    num_pages : int, optional
        Number of pages to scrape. Defaults to 8, the count of pages
        the course collection currently exposes.

    Returns
    -------
    list[tuple[str, str, str]]
        All ``(title, description, price)`` tuples across the pages.
    """
    all_courses = []

    # Pages are 1-indexed on the site, hence range(1, num_pages + 1).
    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")
        all_courses.extend(scrape_courses_from_page(page))

    return all_courses
63 |
+
|
64 |
+
|
# Insert scraped data into the database
def insert_data_to_db(courses):
    """Bulk-insert scraped courses into the ``courses`` table.

    Parameters
    ----------
    courses : list[tuple[str, str, str]]
        ``(title, description, price)`` rows, as produced by the
        scraping functions. An empty list is a valid no-op.
    """
    conn = sqlite3.connect('courses.db')
    try:
        c = conn.cursor()
        # Parameterized bulk insert — one round trip, no SQL injection risk.
        c.executemany('INSERT INTO courses (title, description, price) VALUES (?, ?, ?)', courses)
        conn.commit()
    finally:
        # Guarantee the connection is released even if the insert fails.
        conn.close()
72 |
+
|
73 |
+
|
if __name__ == "__main__":
    # Full pipeline: ensure the schema exists, scrape every page,
    # then persist the results.
    create_database()
    all_courses = scrape_all_pages()
    insert_data_to_db(all_courses)
    # Plain string — nothing to interpolate, so no f-prefix.
    print("Data from all pages has been successfully scraped and inserted into the database.")
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests
|
2 |
+
beautifulsoup4
|
3 |
+
gradio
|
4 |
+
streamlit
|
5 |
+
# sqlite3 ships with the Python standard library; "pip install sqlite3" fails, so it is not listed as a dependency
|
6 |
+
python-dotenv
|
7 |
+
openai
|