# CNBC Indonesia search scraper — Streamlit app
import base64
import re
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
def scrape_page(url):
    """Scrape one CNBC Indonesia search-results page.

    Parameters
    ----------
    url : str
        Fully-qualified search-results URL (query + page already encoded).

    Returns
    -------
    list[dict]
        One dict per ``<article>`` with keys 'Title', 'Category', 'Time'.
        Empty list when the request fails or the page has no articles.
    """
    try:
        # Timeout so a dead connection cannot hang the Streamlit app forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Request failed: {exc}")
        return []
    if response.status_code != 200:
        st.error(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    data = []
    for article in soup.find_all('article'):
        title_tag = article.find('h2')
        time_tag = article.find('span', class_='date')
        if title_tag is None or time_tag is None:
            # Skip malformed article markup instead of crashing on `.text`.
            continue
        category_tag = article.find('span', class_='label')
        data.append({
            'Title': title_tag.text.strip(),
            # BUG FIX: original stored the bs4 Tag object itself, not its text.
            'Category': category_tag.text.strip() if category_tag else None,
            'Time': time_tag.text.strip(),
        })
    return data
def scrape_multiple_pages(query, num_pages):
    """Scrape `num_pages` pages of CNBC Indonesia search results for `query`.

    Parameters
    ----------
    query : str
        Raw user search text; URL-encoded here before building the URL.
    num_pages : int
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    list[dict]
        Concatenated rows from :func:`scrape_page` across all pages.
    """
    # BUG FIX: the query was interpolated raw into the URL, so spaces and
    # special characters produced an invalid/incorrect request. Encode it.
    base_url = f"https://www.cnbcindonesia.com/search?query={quote_plus(query)}"
    all_data = []
    for page_num in range(1, num_pages + 1):
        url = f"{base_url}&page={page_num}"
        st.write(f"Scraping page {page_num} - {url}")
        all_data.extend(scrape_page(url))
    return all_data
# ---------------------------------------------------------------------------
# Streamlit app
# ---------------------------------------------------------------------------

def _parse_relative_date(time_info):
    """Convert an Indonesian relative-time string ('2 jam yang lalu') to a
    'YYYY-MM-DD' date string, or None when the format is not recognized.

    BUG FIX: the original loop handled only 'jam'/'hari' and appended a date
    only on a regex match, so 'menit' rows left `date` unbound (NameError) or
    reused a stale value, and a short list raised ValueError when assigned
    back as a DataFrame column.
    """
    if not isinstance(time_info, str):
        # NaN appears when 'Time' contained no '-' separator to split on.
        return None
    match = re.search(r'(\d+) (\w+) yang lalu', time_info)
    if not match:
        return None
    quantity, unit = int(match.group(1)), match.group(2)
    if unit == "menit":
        delta = timedelta(minutes=quantity)
    elif unit == "jam":
        delta = timedelta(hours=quantity)
    elif unit == "hari":
        delta = timedelta(days=quantity)
    else:
        return None
    return (datetime.now() - delta).strftime("%Y-%m-%d")


st.title("CNBC Indonesia Scraper")

# Accept user input for the query and number of pages.
query = st.text_input("Enter the search query:")
num_pages = st.number_input("Select the number of pages to scrape:", 1, 10000, 5)

if st.button("Scrape Data"):
    df = pd.DataFrame(scrape_multiple_pages(query, num_pages))
    if df.empty:
        # BUG FIX: the original raised KeyError on df['Time'] when the scrape
        # produced no rows (bad query, blocked request, empty results).
        st.warning("No articles were scraped.")
    else:
        # 'Time' arrives as 'Category - <relative time>'; split on the first
        # '-' so hyphens inside the time text survive.
        df[['Category', 'Time_info']] = df['Time'].str.split('-', n=1, expand=True)
        df['Time_info'] = df['Time_info'].str.strip()
        df = df.drop('Time', axis=1)
        # Always one entry per row; unparseable strings become None.
        df["Date"] = [_parse_relative_date(t) for t in df["Time_info"]]
        st.write(df)
        # BUG FIX: st.download_button needs str/bytes content; the original
        # passed the DataFrame object itself.
        st.download_button(
            label='Download csv',
            data=df.to_csv(index=False).encode('utf-8'),
            file_name=f'{query}_cncb.csv',
        )