File size: 3,147 Bytes
7f7f8e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import requests
from bs4 import BeautifulSoup
import csv
import re
from datetime import datetime
import streamlit as st
import pandas as pd
import time

def scrape_detik_search(query, num_pages):
    """Scrape Detik.com news search results.

    Args:
        query: Search term passed to Detik's search endpoint.
        num_pages: Number of result pages to fetch, starting from page 1.

    Returns:
        A list of [title, link, category, date, time] rows. Article cards
        missing any required tag, or whose date text does not match the
        expected patterns, are skipped.
    """
    base_url = "https://www.detik.com/search/searchnews"
    all_results = []

    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page
        }

        # Timeout prevents the app from hanging forever on a stalled request.
        response = requests.get(base_url, params=params, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')

            for article in articles:
                title_tag = article.find('h2', class_='title')
                date_tag = article.find('span', class_='date')
                link_tag = article.find('a')
                category_tag = article.find('span', class_='category')

                # Skip malformed article cards instead of crashing with
                # AttributeError when a tag is absent.
                if not (title_tag and date_tag and link_tag and category_tag):
                    continue

                # Fetch and strip the date text once; the original fetched
                # the same span twice.
                date_text = date_tag.text.strip()

                # Extract date and time using regular expressions
                date_match = re.search(r'\d{1,2} [a-zA-Z]+ \d{4}', date_text)
                time_match = re.search(r'\d{2}:\d{2}', date_text)

                if date_match and time_match:
                    all_results.append([
                        title_tag.text.strip(),
                        link_tag['href'],
                        category_tag.text.strip(),
                        date_match.group(),
                        time_match.group(),
                    ])

            # Brief polite pause between page requests.
            with st.spinner('Mohon Ditunggu...'):
                time.sleep(1)

        else:
            st.write(f"Error {response.status_code}: Unable to retrieve data from Page {page}.")

    return all_results

def save_to_csv(results, filename):
    """Write scraped result rows to *filename* as a UTF-8 CSV file.

    A header row (Judul, Link, Kategori, Tanggal, Waktu) is written first,
    followed by one row per entry in *results*.
    """
    header = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows([header] + list(results))

# ---- Streamlit UI ----------------------------------------------------------
st.title("Detik.com Search Results Scraper")

# Sidebar controls: what to search for and how many result pages to walk.
search_query = st.sidebar.text_input("Enter the search query:")
num_pages_to_scrape = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

if st.sidebar.button("Scrape Results"):
    search_results = scrape_detik_search(search_query, num_pages_to_scrape)

    # Show the scraped rows before writing them to disk.
    with st.expander('Data Preview'):
        preview = pd.DataFrame(
            search_results,
            columns=["Judul", "Link", "Kategori", "Tanggal", "Waktu"],
        )
        st.dataframe(preview)

    # Persist results; the file name is derived from the query.
    csv_filename = f"{search_query.replace(' ', '_')}_detik_search_results.csv"
    save_to_csv(search_results, csv_filename)
    st.success(f"Results saved to {csv_filename}")