# cncb_scraper / app.py — CNBC Indonesia search scraper (Streamlit app)
# Author: destiratnakomala (commit 65570f8, "Update app.py")
import base64
import re
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
def scrape_page(url):
    """Scrape one CNBC Indonesia search-result page.

    Parameters
    ----------
    url : str
        Full URL of the search-result page to fetch.

    Returns
    -------
    list[dict]
        One dict per article with 'Title', 'Category', and 'Time' keys.
        Empty list when the request fails or no article parses.
    """
    try:
        # Timeout so a dead host can't hang the Streamlit app indefinitely.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Request failed: {exc}")
        return []
    if response.status_code != 200:
        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    data = []
    for article in soup.find_all('article'):
        title_tag = article.find('h2')
        category_tag = article.find('span', class_='label')
        time_tag = article.find('span', class_='date')
        # Skip malformed entries instead of crashing with AttributeError
        # when an expected tag is missing.
        if title_tag is None or time_tag is None:
            continue
        data.append({
            'Title': title_tag.text.strip(),
            # BUG FIX: the original stored the bs4 Tag object itself,
            # not its text, so the 'Category' column held repr() noise.
            'Category': category_tag.text.strip() if category_tag else None,
            'Time': time_tag.text.strip(),
        })
    return data
def scrape_multiple_pages(query, num_pages):
    """Scrape `num_pages` consecutive search-result pages for `query`.

    Parameters
    ----------
    query : str
        Raw user search text; URL-encoded here before use.
    num_pages : int
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    list[dict]
        Concatenated article rows from every page (see scrape_page).
    """
    # BUG FIX: encode the user-supplied query — spaces or '&' in the raw
    # text would otherwise corrupt the query string.
    base_url = f"https://www.cnbcindonesia.com/search?query={quote_plus(query)}"
    all_data = []
    for page_num in range(1, num_pages + 1):
        url = f"{base_url}&page={page_num}"
        st.write(f"Scraping page {page_num} - {url}")
        all_data.extend(scrape_page(url))
    return all_data
# Streamlit app
st.title("CNBC Indonesia Scraper")

# Accept user input for the query and number of pages.
query = st.text_input("Enter the search query:")
num_pages = st.number_input("Select the number of pages to scrape:", 1, 10000, 5)

# Scrape, post-process, display, and offer the result as a CSV download.
if st.button("Scrape Data"):
    df = pd.DataFrame(scrape_multiple_pages(query, num_pages))
    if df.empty:
        # BUG FIX: indexing df['Time'] on an empty frame raised KeyError
        # whenever the scrape returned no rows.
        st.warning("No articles were found for this query.")
    else:
        # 'Time' arrives as "Category - <relative time>"; split on the first dash.
        df[['Category', 'Time_info']] = df['Time'].str.split('-', n=1, expand=True)
        # Remove unnecessary spaces from the 'Time_info' column.
        df['Time_info'] = df['Time_info'].str.strip()
        # Drop the original 'Time' column.
        df = df.drop('Time', axis=1)

        # Convert relative Indonesian timestamps ("N jam/hari yang lalu",
        # i.e. "N hours/days ago") to absolute YYYY-MM-DD dates.
        now = datetime.now()  # hoisted so all rows share one reference time
        dates = []
        for time_info in df["Time_info"]:
            date_str = None
            match = re.search(r'(\d+) (\w+) yang lalu', str(time_info))
            if match:
                quantity, unit = int(match.group(1)), match.group(2)
                if unit == "jam":
                    date_str = (now - timedelta(hours=quantity)).strftime("%Y-%m-%d")
                elif unit == "hari":
                    date_str = (now - timedelta(days=quantity)).strftime("%Y-%m-%d")
            # BUG FIX: the original appended only when the regex matched AND
            # the unit was known, so `dates` could be shorter than the frame
            # (ValueError on column assignment) or silently reuse a stale
            # `date` from a previous iteration. Always append a value.
            dates.append(date_str)
        df["Date"] = dates

        # Display the DataFrame.
        st.write(df)

        # BUG FIX: download_button requires str/bytes; the original passed
        # the DataFrame object itself.
        st.download_button(
            label='Download csv',
            data=df.to_csv(index=False).encode('utf-8'),
            file_name=f'{query}_cncb.csv',
            mime='text/csv',
        )