# detik_scrape / app.py
# Author: destiratnakomala — "Create app.py" (commit 7f7f8e9, verified)
import requests
from bs4 import BeautifulSoup
import csv
import re
from datetime import datetime
import streamlit as st
import pandas as pd
import time
def scrape_detik_search(query, num_pages):
    """Scrape Detik.com news search results for *query*.

    Args:
        query: Search keywords passed to the Detik search endpoint.
        num_pages: Number of result pages to fetch (pages 1..num_pages).

    Returns:
        A list of ``[title, link, category, date, time]`` rows. Articles
        whose title/date/link/category elements are missing, or whose date
        text does not match the expected patterns, are skipped.
    """
    base_url = "https://www.detik.com/search/searchnews"
    # Compile once; the original rebuilt both patterns for every article.
    date_re = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
    time_re = re.compile(r'\d{2}:\d{2}')
    all_results = []
    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page
        }
        # Timeout so a stalled connection cannot hang the Streamlit app.
        response = requests.get(base_url, params=params, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for article in soup.find_all('article'):
                title_el = article.find('h2', class_='title')
                date_el = article.find('span', class_='date')
                link_el = article.find('a')
                category_el = article.find('span', class_='category')
                # Skip malformed entries; the original raised AttributeError
                # (None.text) whenever any of these elements was absent.
                if not (title_el and date_el and link_el and category_el):
                    continue
                date_text = date_el.text.strip()  # read once, not twice
                date_match = date_re.search(date_text)
                time_match = time_re.search(date_text)
                if date_match and time_match:
                    all_results.append([
                        title_el.text.strip(),
                        link_el.get('href', ''),
                        category_el.text.strip(),
                        date_match.group(),
                        time_match.group(),
                    ])
            # Brief pause between pages to avoid hammering the server.
            with st.spinner('Mohon Ditunggu...'):
                time.sleep(1)
        else:
            st.write(f"Error {response.status_code}: Unable to retrieve data from Page {page}.")
    return all_results
def save_to_csv(results, filename):
    """Write scraped result rows to *filename* as UTF-8 CSV.

    A fixed Indonesian header row is emitted first, followed by one row
    per scraped article.
    """
    header = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(header)
        csv_writer.writerows(results)
# ---- Streamlit UI ----
st.title("Detik.com Search Results Scraper")

# Sidebar controls: query text plus a bounded page count (1..100, default 1).
search_query = st.sidebar.text_input("Enter the search query:")
num_pages_to_scrape = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

if st.sidebar.button("Scrape Results"):
    # Run the scrape, show a preview table, then persist the rows to disk.
    search_results = scrape_detik_search(search_query, num_pages_to_scrape)
    columns = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    with st.expander('Data Preview'):
        preview = pd.DataFrame(search_results, columns=columns)
        st.dataframe(preview)
    # CSV name is derived from the query (spaces replaced by underscores).
    csv_filename = f"{search_query.replace(' ', '_')}_detik_search_results.csv"
    save_to_csv(search_results, csv_filename)
    st.success(f"Results saved to {csv_filename}")