import csv
import re
import time

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup


def scrape_detik_search(query, num_pages):
    """Scrape Detik.com news search results for `query` across `num_pages` pages."""
    base_url = "https://www.detik.com/search/searchnews"
    # Dates appear as e.g. "12 Agu 2023"; times as "14:30".
    date_pattern = r'\d{1,2} [a-zA-Z]+ \d{4}'
    time_pattern = r'\d{2}:\d{2}'
    all_results = []
    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page
        }
        response = requests.get(base_url, params=params, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for article in soup.find_all('article'):
                title_tag = article.find('h2', class_='title')
                date_tag = article.find('span', class_='date')
                link_tag = article.find('a')
                category_tag = article.find('span', class_='category')
                # Skip entries (e.g. ads or promos) that lack the expected markup,
                # which would otherwise raise an AttributeError.
                if not (title_tag and date_tag and link_tag and category_tag):
                    continue
                title = title_tag.text.strip()
                date_text = date_tag.text.strip()
                # Extract date and time using regular expressions.
                date_match = re.search(date_pattern, date_text)
                time_match = re.search(time_pattern, date_text)
                if date_match and time_match:
                    # The date is kept as the raw Indonesian string ("12 Agu 2023");
                    # datetime.strptime with '%d %b %Y' would fail on Indonesian
                    # month abbreviations, so no conversion is attempted here.
                    extracted_date = date_match.group()
                    extracted_time = time_match.group()
                    link = link_tag['href']
                    category = category_tag.text.strip()
                    all_results.append([title, link, category, extracted_date, extracted_time])
            # Brief pause between pages so we do not hammer the server.
            with st.spinner('Please wait...'):
                time.sleep(1)
        else:
            st.error(f"Error {response.status_code}: Unable to retrieve data from page {page}.")
    return all_results


def save_to_csv(results, filename):
    """Write the scraped rows to a CSV file with a header row."""
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Title", "Link", "Category", "Date", "Time"])
        # Write the data
        writer.writerows(results)


# Streamlit app
st.title("Detik.com Search Results Scraper")

# Get user input for the search query and the number of pages.
search_query = st.sidebar.text_input("Enter the search query:")
num_pages_to_scrape = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

# Scrape Detik.com search results.
if st.sidebar.button("Scrape Results"):
    if not search_query.strip():
        st.warning("Please enter a search query first.")
    else:
        search_results = scrape_detik_search(search_query, num_pages_to_scrape)
        with st.expander('Data Preview'):
            df = pd.DataFrame(search_results, columns=["Title", "Link", "Category", "Date", "Time"])
            st.dataframe(df)
        # Save the results to a CSV file named after the query.
        csv_filename = f"{search_query.replace(' ', '_')}_detik_search_results.csv"
        save_to_csv(search_results, csv_filename)
        st.success(f"Results saved to {csv_filename}")
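
# Usage note: this is a Streamlit app, so it is launched from the command line
# rather than with `python`. Assuming the file is saved as detik_scraper.py
# (the filename is illustrative, not prescribed by the script):
#
#   streamlit run detik_scraper.py
#
# Detik.com's markup can change at any time; the 'article', 'h2.title',
# 'span.date', and 'span.category' selectors above reflect the page structure
# this script was written against and may need updating if results come back empty.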