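# CNBC Indonesia search scraper: a Streamlit app that collects article
# titles and relative timestamps from the site's search results, derives
# each article's category and publication date, and offers the result as
# a CSV download.
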
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from urllib.parse import quote_plus
import re

def scrape_page(url):
    response = requests.get(url, timeout=10)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')

        data = []

        for article in articles:
            # These selectors assume the current markup of the
            # cnbcindonesia.com search results and may need updating if
            # the site changes. The category is embedded in the date
            # string ("Category - N jam yang lalu") and is split out
            # after scraping, so only Title and Time are collected here.
            title_tag = article.find('h2')
            time_tag = article.find('span', class_='date')

            # Skip result blocks that lack the expected elements
            if title_tag is None or time_tag is None:
                continue

            data.append({
                'Title': title_tag.text.strip(),
                'Time': time_tag.text.strip(),
            })

        return data

    else:
        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

def scrape_multiple_pages(query, num_pages):
    # Percent-encode the query so spaces and special characters survive in the URL
    base_url = f"https://www.cnbcindonesia.com/search?query={quote_plus(query)}"

    all_data = []

    for page_num in range(1, num_pages + 1):
        url = f"{base_url}&page={page_num}"
        st.write(f"Scraping page {page_num} - {url}")
        page_data = scrape_page(url)
        all_data.extend(page_data)

    return all_data

# Streamlit app
st.title("CNBC Indonesia Scraper")

# Accept user input for the query and number of pages
query = st.text_input("Enter the search query:")
num_pages = st.number_input("Select the number of pages to scrape:", 1, 10000, 5)

# Display scraping progress
if st.button("Scrape Data"):
    df = pd.DataFrame(scrape_multiple_pages(query, num_pages))

    if df.empty:
        st.warning("No articles were found, so there is nothing to process.")
        st.stop()

    # Split 'Time' ("Category - N jam yang lalu") into 'Category' and 'Time_info'
    df[['Category', 'Time_info']] = df['Time'].str.split('-', n=1, expand=True)

    # Remove the whitespace left around the split pieces
    df['Category'] = df['Category'].str.strip()
    df['Time_info'] = df['Time_info'].str.strip()

    # Drop the original 'Time' column
    df = df.drop('Time', axis=1)

    # Convert relative timestamps such as "2 jam yang lalu" ("2 hours
    # ago") or "3 hari yang lalu" ("3 days ago") into absolute dates
    dates = []
    for time_info in df["Time_info"]:
        date = None
        match = re.search(r'(\d+) (\w+) yang lalu', str(time_info))
        if match:
            quantity, unit = int(match.group(1)), match.group(2)
            if unit == "jam":
                date = datetime.now() - timedelta(hours=quantity)
            elif unit == "hari":
                date = datetime.now() - timedelta(days=quantity)
        # Append None for timestamps that don't match the expected pattern
        # (or use another unit), so the list stays aligned with the rows
        dates.append(date.strftime("%Y-%m-%d") if date else None)

    # Add the extracted dates to the DataFrame
    df["Date"] = dates

    # Display the DataFrame
    st.write(df)

    # Offer the processed data as a CSV download
    st.download_button(
        label='Download CSV',
        data=df.to_csv(index=False).encode('utf-8'),
        file_name=f'{query}_cnbc.csv',
        mime='text/csv',
    )
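
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py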