# CNBC Indonesia search scraper — Streamlit app
import base64
import re
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
def scrape_page(url):
    """Scrape one CNBC Indonesia search-results page.

    Parameters
    ----------
    url : str
        Fully-qualified search-results URL (query + page already encoded).

    Returns
    -------
    list[dict]
        One dict per ``<article>`` with keys 'Title', 'Category', 'Time'.
        Empty list when the request fails or the page has no articles.
    """
    try:
        # Timeout so a dead connection cannot hang the Streamlit app forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Request failed: {exc}")
        return []
    if response.status_code != 200:
        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    data = []
    for article in soup.find_all('article'):
        title_tag = article.find('h2')
        time_tag = article.find('span', class_='date')
        if title_tag is None or time_tag is None:
            # Skip malformed article markup instead of crashing on `.text`.
            continue
        category_tag = article.find('span', class_='label')
        data.append({
            'Title': title_tag.text.strip(),
            # BUG FIX: original stored the bs4 Tag object itself, not its text.
            'Category': category_tag.text.strip() if category_tag else None,
            'Time': time_tag.text.strip(),
        })
    return data
def scrape_multiple_pages(query, num_pages):
    """Scrape `num_pages` pages of CNBC Indonesia search results for `query`.

    Parameters
    ----------
    query : str
        Raw user search text; URL-encoded here before building the URL.
    num_pages : int
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    list[dict]
        Concatenated rows from :func:`scrape_page` across all pages.
    """
    # BUG FIX: the query was interpolated raw into the URL, so spaces and
    # special characters produced an invalid/incorrect request. Encode it.
    base_url = f"https://www.cnbcindonesia.com/search?query={quote_plus(query)}"
    all_data = []
    for page_num in range(1, num_pages + 1):
        url = f"{base_url}&page={page_num}"
        st.write(f"Scraping page {page_num} - {url}")
        all_data.extend(scrape_page(url))
    return all_data
# ---------------------------------------------------------------------------
# Streamlit app
# ---------------------------------------------------------------------------

def _parse_relative_date(time_info):
    """Convert an Indonesian relative-time string ('2 jam yang lalu') to a
    'YYYY-MM-DD' date string, or None when the format is not recognized.

    BUG FIX: the original loop handled only 'jam'/'hari' and appended a date
    only on a regex match, so 'menit' rows left `date` unbound (NameError) or
    reused a stale value, and a short list raised ValueError when assigned
    back as a DataFrame column.
    """
    if not isinstance(time_info, str):
        # NaN appears when 'Time' contained no '-' separator to split on.
        return None
    match = re.search(r'(\d+) (\w+) yang lalu', time_info)
    if not match:
        return None
    quantity, unit = int(match.group(1)), match.group(2)
    if unit == "menit":
        delta = timedelta(minutes=quantity)
    elif unit == "jam":
        delta = timedelta(hours=quantity)
    elif unit == "hari":
        delta = timedelta(days=quantity)
    else:
        return None
    return (datetime.now() - delta).strftime("%Y-%m-%d")


st.title("CNBC Indonesia Scraper")

# Accept user input for the query and number of pages.
query = st.text_input("Enter the search query:")
num_pages = st.number_input("Select the number of pages to scrape:", 1, 10000, 5)

if st.button("Scrape Data"):
    df = pd.DataFrame(scrape_multiple_pages(query, num_pages))
    if df.empty:
        # BUG FIX: the original raised KeyError on df['Time'] when the scrape
        # produced no rows (bad query, blocked request, empty results).
        st.warning("No articles were scraped.")
    else:
        # 'Time' arrives as 'Category - <relative time>'; split on the first
        # '-' so hyphens inside the time text survive.
        df[['Category', 'Time_info']] = df['Time'].str.split('-', n=1, expand=True)
        df['Time_info'] = df['Time_info'].str.strip()
        df = df.drop('Time', axis=1)
        # Always one entry per row; unparseable strings become None.
        df["Date"] = [_parse_relative_date(t) for t in df["Time_info"]]
        st.write(df)
        # BUG FIX: st.download_button needs str/bytes content; the original
        # passed the DataFrame object itself.
        st.download_button(
            label='Download csv',
            data=df.to_csv(index=False).encode('utf-8'),
            file_name=f'{query}_cncb.csv',
        )