destiratnakomala commited on
Commit
7f7f8e9
1 Parent(s): b084bc5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import csv
4
+ import re
5
+ from datetime import datetime
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import time
9
+
10
def scrape_detik_search(query, num_pages):
    """Scrape detik.com news-search results for *query*.

    Fetches up to *num_pages* result pages (sorted newest-first) and returns
    a list of ``[title, link, category, date, time]`` rows.  Pages that fail
    to download are reported via ``st.write`` and skipped; article cards that
    lack the expected markup (ads/promos) are skipped silently.

    Parameters:
        query: search term passed to detik.com's search endpoint.
        num_pages: number of result pages to fetch (1-based, inclusive).

    Returns:
        list[list[str]]: one row per successfully parsed article.
    """
    base_url = "https://www.detik.com/search/searchnews"
    all_results = []

    for page in range(1, num_pages + 1):
        params = {
            "query": query,
            "sortby": "time",
            "page": page,
        }

        # Timeout so a stalled connection cannot hang the Streamlit app;
        # network failures are reported and the page is skipped.
        try:
            response = requests.get(base_url, params=params, timeout=15)
        except requests.RequestException as exc:
            st.write(f"Error: request for Page {page} failed ({exc}).")
            continue

        if response.status_code != 200:
            st.write(f"Error {response.status_code}: Unable to retrieve data from Page {page}.")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for article in soup.find_all('article'):
            title_tag = article.find('h2', class_='title')
            date_tag = article.find('span', class_='date')
            link_tag = article.find('a')
            category_tag = article.find('span', class_='category')

            # Skip malformed cards instead of crashing with
            # AttributeError on `.text` of None.
            if not (title_tag and date_tag and link_tag and category_tag):
                continue

            # Fetch the date span text once; the original re-queried the DOM
            # twice for the same node.
            date_text = date_tag.text.strip()
            date_match = re.search(r'\d{1,2} [a-zA-Z]+ \d{4}', date_text)
            time_match = re.search(r'\d{2}:\d{2}', date_text)
            if not (date_match and time_match):
                continue

            all_results.append([
                title_tag.text.strip(),
                link_tag.get('href', ''),
                category_tag.text.strip(),
                date_match.group(),
                time_match.group(),
            ])

        # Brief pause between pages to be polite to the server
        # (only after a successful fetch, as in the original).
        with st.spinner('Mohon Ditunggu...'):
            time.sleep(1)

    return all_results
59
+
60
def save_to_csv(results, filename):
    """Write the scraped rows to *filename* as a UTF-8 CSV.

    A fixed Indonesian header row is written first, followed by one row per
    scraped article.  ``newline=''`` is required so the csv module controls
    line endings itself.
    """
    header = ["Judul", "Link", "Kategori", "Tanggal", "Waktu"]
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows([header, *results])
67
+
68
# --- Streamlit UI ----------------------------------------------------------
st.title("Detik.com Search Results Scraper")

# Sidebar controls: free-text query plus a bounded page count (1..100, default 1).
query = st.sidebar.text_input("Enter the search query:")
pages = st.sidebar.number_input("Select the number of pages to scrape:", 1, 100, 1)

if st.sidebar.button("Scrape Results"):
    results = scrape_detik_search(query, pages)

    # Collapsible preview of the scraped rows as a DataFrame.
    with st.expander('Data Preview'):
        st.dataframe(
            pd.DataFrame(
                results,
                columns=["Judul", "Link", "Kategori", "Tanggal", "Waktu"],
            )
        )

    # Persist the rows next to the app, using the query as the file stem.
    out_name = f"{query.replace(' ', '_')}_detik_search_results.csv"
    save_to_csv(results, out_name)
    st.success(f"Results saved to {out_name}")
87
+