Spaces:

destiratnakomala
/

detik_scrape

Sleeping

App Files Files Community

destiratnakomala committed on Feb 4

Commit

7f7f8e9

•

1 Parent(s): b084bc5

Create app.py

Browse files

Files changed (1) hide show

app.py +87 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import requests
+from bs4 import BeautifulSoup
+import csv
+import re
+from datetime import datetime
+import streamlit as st
+import pandas as pd
+import time
# Patterns for the date part (day, alphabetic month, 4-digit year) and the
# time part (HH:MM) of an article's date label; compiled once, not per article.
_DATE_PATTERN = re.compile(r'\d{1,2} [a-zA-Z]+ \d{4}')
_TIME_PATTERN = re.compile(r'\d{2}:\d{2}')


def _parse_article(article):
    """Turn one ``<article>`` tag into ``[title, link, category, date, time]``.

    Returns ``None`` when the article lacks a title, a date label, a link, or
    when the date label does not contain both a date and a time, so that one
    malformed entry is skipped instead of aborting the whole scrape.
    """
    title_tag = article.find('h2', class_='title')
    date_tag = article.find('span', class_='date')
    link_tag = article.find('a')
    if title_tag is None or date_tag is None or link_tag is None:
        return None

    link = link_tag.get('href')
    if not link:
        return None

    # Look the label text up once and run both patterns against it.
    date_text = date_tag.text.strip()
    date_match = _DATE_PATTERN.search(date_text)
    time_match = _TIME_PATTERN.search(date_text)
    if not (date_match and time_match):
        return None

    # The category is treated as optional: a missing label yields ''.
    category_tag = article.find('span', class_='category')
    category = category_tag.text.strip() if category_tag is not None else ''

    # Date and time are kept as the raw matched text (no datetime conversion).
    return [
        title_tag.text.strip(),
        link,
        category,
        date_match.group(),
        time_match.group(),
    ]


def scrape_detik_search(query, num_pages, timeout=10):
    """Scrape detik.com search results for *query*, newest first.

    Args:
        query: Search text sent as the ``query`` request parameter.
        num_pages: Number of result pages to fetch, starting at page 1.
            A value below 1 fetches nothing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``[title, link, category, date, time]`` rows, one per
        article that could be parsed. Pages that fail to load are reported
        in the Streamlit app and skipped.
    """
    base_url = "https://www.detik.com/search/searchnews"
    all_results = []
    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page
        }
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on to the next page.
            st.write(f"Error: Unable to retrieve data from Page {page} ({exc}).")
            continue

        if response.status_code != 200:
            st.write(f"Error {response.status_code}: Unable to retrieve data from Page {page}.")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for article in soup.find_all('article'):
            row = _parse_article(article)
            if row is not None:
                all_results.append(row)

        # Pause one second between pages while showing a spinner in the UI.
        with st.spinner('Mohon Ditunggu...'):
            time.sleep(1)
    return all_results
def save_to_csv(results, filename):
    """Write *results* to *filename* as a UTF-8 CSV file.

    The file is overwritten if it exists. The first row is the fixed header
    (Judul, Link, Kategori, Tanggal, Waktu); each item of *results* follows
    as one row.
    """
    header = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        for row in results:
            out.writerow(row)
# Streamlit app: sidebar inputs on the left, results and status in the main area.
st.title("Detik.com Search Results Scraper")

# Get user input for search query and number of pages (1 to 100, default 1).
search_query = st.sidebar.text_input("Enter the search query:")
num_pages_to_scrape = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

# Scrape Detik.com search results when the button is pressed.
if st.sidebar.button("Scrape Results"):
    if not search_query.strip():
        # Nothing to search for: do not hit the site or write a file.
        st.warning("Please enter a search query first.")
    else:
        search_results = scrape_detik_search(search_query, num_pages_to_scrape)
        with st.expander('Data Preview'):
            df = pd.DataFrame(search_results, columns=["Judul", "Link", "Kategori", "Tanggal", "Waktu"])
            st.dataframe(df)

        # Save results to a CSV file. The name is built from user input, so
        # every character other than word characters and '-' is replaced with
        # '_' to keep path separators and '..' out of the filename. Spaces
        # still map one-to-one to '_', as before.
        safe_query = re.sub(r"[^\w-]", "_", search_query)
        csv_filename = f"{safe_query}_detik_search_results.csv"
        save_to_csv(search_results, csv_filename)
        st.success(f"Results saved to {csv_filename}")