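# CNBC Indonesia search scraper: a Streamlit app that collects article
# titles and relative timestamps from the site's search results, derives
# each article's category and publication date, and offers the result as
# a CSV download.
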
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from urllib.parse import quote_plus
import re

def scrape_page(url):
    response = requests.get(url, timeout=10)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('article')

        data = []

        for article in articles:
            # These selectors assume the current markup of the
            # cnbcindonesia.com search results and may need updating if
            # the site changes. The category is embedded in the date
            # string ("Category - N jam yang lalu") and is split out
            # after scraping, so only Title and Time are collected here.
            title_tag = article.find('h2')
            time_tag = article.find('span', class_='date')

            # Skip result blocks that lack the expected elements
            if title_tag is None or time_tag is None:
                continue

            data.append({
                'Title': title_tag.text.strip(),
                'Time': time_tag.text.strip(),
            })

        return data

    else:
        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

def scrape_multiple_pages(query, num_pages):
    # Percent-encode the query so spaces and special characters survive in the URL
    base_url = f"https://www.cnbcindonesia.com/search?query={quote_plus(query)}"

    all_data = []

    for page_num in range(1, num_pages + 1):
        url = f"{base_url}&page={page_num}"
        st.write(f"Scraping page {page_num} - {url}")
        page_data = scrape_page(url)
        all_data.extend(page_data)

    return all_data

# Streamlit app
st.title("CNBC Indonesia Scraper")

# Accept user input for the query and number of pages
query = st.text_input("Enter the search query:")
num_pages = st.number_input("Select the number of pages to scrape:", 1, 10000, 5)

# Display scraping progress
if st.button("Scrape Data"):
    df = pd.DataFrame(scrape_multiple_pages(query, num_pages))

    if df.empty:
        st.warning("No articles were found, so there is nothing to process.")
        st.stop()

    # Split 'Time' ("Category - N jam yang lalu") into 'Category' and 'Time_info'
    df[['Category', 'Time_info']] = df['Time'].str.split('-', n=1, expand=True)

    # Remove the whitespace left around the split pieces
    df['Category'] = df['Category'].str.strip()
    df['Time_info'] = df['Time_info'].str.strip()

    # Drop the original 'Time' column
    df = df.drop('Time', axis=1)

    # Convert relative timestamps such as "2 jam yang lalu" ("2 hours
    # ago") or "3 hari yang lalu" ("3 days ago") into absolute dates
    dates = []
    for time_info in df["Time_info"]:
        date = None
        match = re.search(r'(\d+) (\w+) yang lalu', str(time_info))
        if match:
            quantity, unit = int(match.group(1)), match.group(2)
            if unit == "jam":
                date = datetime.now() - timedelta(hours=quantity)
            elif unit == "hari":
                date = datetime.now() - timedelta(days=quantity)
        # Append None for timestamps that don't match the expected pattern
        # (or use another unit), so the list stays aligned with the rows
        dates.append(date.strftime("%Y-%m-%d") if date else None)

    # Add the extracted dates to the DataFrame
    df["Date"] = dates

    # Display the DataFrame
    st.write(df)

    # Offer the processed data as a CSV download
    st.download_button(
        label='Download CSV',
        data=df.to_csv(index=False).encode('utf-8'),
        file_name=f'{query}_cnbc.csv',
        mime='text/csv',
    )
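
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py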