Spaces:
Sleeping
Sleeping
naufalnashif
committed on
Commit
•
abb9b5c
1
Parent(s):
2385ba5
Update app.py
Browse files
app.py
CHANGED
@@ -20,7 +20,7 @@ from nltk.corpus import stopwords
|
|
20 |
#---------------------------------------------------Scraping Function----------------------------------------------------------------------
|
21 |
|
22 |
@st.cache_data
|
23 |
-
def scrape_cnbc_data(query, date, jumlah):
|
24 |
data = []
|
25 |
page = 1
|
26 |
progress_text = "Scraping in progress. Please wait."
|
@@ -88,7 +88,7 @@ def scrape_cnbc_data(query, date, jumlah):
|
|
88 |
|
89 |
|
90 |
@st.cache_data
|
91 |
-
def scrape_detik_news(query, date, jumlah):
|
92 |
start_page = 1
|
93 |
base_url = "https://www.detik.com/search/searchall"
|
94 |
data = []
|
@@ -159,7 +159,7 @@ def scrape_detik_news(query, date, jumlah):
|
|
159 |
return data
|
160 |
|
161 |
@st.cache_data
|
162 |
-
def scrape_viva_data(query, date, jumlah):
|
163 |
data = []
|
164 |
page = 1
|
165 |
progress_text = "Scraping in progress. Please wait."
|
@@ -227,20 +227,51 @@ def scrape_viva_data(query, date, jumlah):
|
|
227 |
return data
|
228 |
|
229 |
@st.cache_data
|
230 |
-
def scrape_tempo_data(query, date, jumlah):
|
231 |
data = []
|
232 |
domain = 1
|
233 |
max_domains = 5
|
234 |
progress_text = "Scraping in progress. Please wait."
|
235 |
my_bar = st.progress(len(data), text=progress_text)
|
236 |
# List of channel values
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
seen_titles = set() # Set untuk melacak judul berita yang sudah muncul
|
239 |
|
240 |
try:
|
241 |
while len(data) < jumlah and domain <= max_domains:
|
242 |
-
for kanal in channels:
|
243 |
-
url = f"https://www.tempo.co/search?waktu={waktu}&kanal={
|
244 |
user_agents = [
|
245 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
|
246 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
|
@@ -271,6 +302,7 @@ def scrape_tempo_data(query, date, jumlah):
|
|
271 |
date = date_before.replace(category, '')
|
272 |
data.append({
|
273 |
'category': category,
|
|
|
274 |
'date': date,
|
275 |
'judul-berita': title,
|
276 |
'link-berita': link,
|
@@ -431,8 +463,8 @@ def eksplorasi_data(selected_options, results, colormap, words):
|
|
431 |
|
432 |
st.pyplot(fig)
|
433 |
@st.cache_data
|
434 |
-
def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words):
|
435 |
-
data_df = _scrape_function(query, date, jumlah)
|
436 |
hidden_data = data_df
|
437 |
scraping_done = True
|
438 |
results = preprocessing_data(hidden_data)
|
@@ -450,12 +482,13 @@ with st.expander("Scraping Settings :"):
|
|
450 |
selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
|
451 |
if selected_site == "Tempo.co":
|
452 |
waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
|
|
|
453 |
query = st.text_input("Masukkan Query :").replace(' ', '+')
|
454 |
|
455 |
jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
|
456 |
date = date.today()
|
457 |
download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
|
458 |
-
|
459 |
with st.expander("Preference Settings :"):
|
460 |
selected_options = st.multiselect(
|
461 |
'Pilih tampilan:',
|
@@ -484,21 +517,21 @@ if st.button("Mulai Scraping"):
|
|
484 |
else:
|
485 |
# CNBC Indonesia
|
486 |
if selected_site == "CNBC Indonesia":
|
487 |
-
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words)
|
488 |
|
489 |
# Detik.com
|
490 |
elif selected_site == "Detik.com":
|
491 |
-
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words)
|
492 |
|
493 |
# Viva.co.id
|
494 |
elif selected_site == "Viva.co.id":
|
495 |
st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
|
496 |
-
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words)
|
497 |
|
498 |
# Tempo.co
|
499 |
elif selected_site == "Tempo.co":
|
500 |
st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
|
501 |
-
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words)
|
502 |
|
503 |
# Liputan6.com
|
504 |
elif selected_site == "Liputan6.com":
|
|
|
20 |
#---------------------------------------------------Scraping Function----------------------------------------------------------------------
|
21 |
|
22 |
@st.cache_data
|
23 |
+
def scrape_cnbc_data(query, date, jumlah, param_kosong):
|
24 |
data = []
|
25 |
page = 1
|
26 |
progress_text = "Scraping in progress. Please wait."
|
|
|
88 |
|
89 |
|
90 |
@st.cache_data
|
91 |
+
def scrape_detik_news(query, date, jumlah, param_kosong):
|
92 |
start_page = 1
|
93 |
base_url = "https://www.detik.com/search/searchall"
|
94 |
data = []
|
|
|
159 |
return data
|
160 |
|
161 |
@st.cache_data
|
162 |
+
def scrape_viva_data(query, date, jumlah, param_kosong):
|
163 |
data = []
|
164 |
page = 1
|
165 |
progress_text = "Scraping in progress. Please wait."
|
|
|
227 |
return data
|
228 |
|
229 |
@st.cache_data
|
230 |
+
def scrape_tempo_data(query, date, jumlah, selected_channel):
|
231 |
data = []
|
232 |
domain = 1
|
233 |
max_domains = 5
|
234 |
progress_text = "Scraping in progress. Please wait."
|
235 |
my_bar = st.progress(len(data), text=progress_text)
|
236 |
# List of channel values
|
237 |
+
default_channels = {
|
238 |
+
'All': '',
|
239 |
+
'Nasional': '20',
|
240 |
+
'Metro': '19',
|
241 |
+
'Dunia': '5',
|
242 |
+
'Bisnis': '1',
|
243 |
+
'Bola': '21',
|
244 |
+
'Sport': '33',
|
245 |
+
'Gaya': '9',
|
246 |
+
'Seleb': '32',
|
247 |
+
'Cantik': '2',
|
248 |
+
'Tekno': '34',
|
249 |
+
'Otomotif': '23',
|
250 |
+
'Travel': '35',
|
251 |
+
'Blog': '43',
|
252 |
+
'Difabel': '44',
|
253 |
+
'Ramadan': '30',
|
254 |
+
'Kolom': '14',
|
255 |
+
'Fokus': '8',
|
256 |
+
'Creative Lab': '47',
|
257 |
+
'Event': '62',
|
258 |
+
'Data': '65',
|
259 |
+
'Cek Fakta': '66',
|
260 |
+
'Newsletter': '63',
|
261 |
+
'Inforial': '12'
|
262 |
+
}
|
263 |
+
|
264 |
+
# Ubah channels sesuai dengan selected_channel
|
265 |
+
if selected_channel != 'Defaults' and selected_channel in default_channels:
|
266 |
+
channels = {selected_channel: default_channels[selected_channel]}
|
267 |
+
else:
|
268 |
+
channels = default_channels
|
269 |
seen_titles = set() # Set untuk melacak judul berita yang sudah muncul
|
270 |
|
271 |
try:
|
272 |
while len(data) < jumlah and domain <= max_domains:
|
273 |
+
for kanal, value in channels.items():
|
274 |
+
url = f"https://www.tempo.co/search?waktu={waktu}&kanal={value}&subkanal=&domain={domain}&q={query}"
|
275 |
user_agents = [
|
276 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
|
277 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
|
|
|
302 |
date = date_before.replace(category, '')
|
303 |
data.append({
|
304 |
'category': category,
|
305 |
+
'kanal' : kanal,
|
306 |
'date': date,
|
307 |
'judul-berita': title,
|
308 |
'link-berita': link,
|
|
|
463 |
|
464 |
st.pyplot(fig)
|
465 |
@st.cache_data
|
466 |
+
def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words, param):
|
467 |
+
data_df = _scrape_function(query, date, jumlah, param)
|
468 |
hidden_data = data_df
|
469 |
scraping_done = True
|
470 |
results = preprocessing_data(hidden_data)
|
|
|
482 |
selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
|
483 |
if selected_site == "Tempo.co":
|
484 |
waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
|
485 |
+
selected_channel = st.selectbox("Pilih Kanal :", ['Defaults','All', 'Nasional', 'Metro', 'Dunia', 'Bisnis', 'Bola', 'Sport', 'Gaya', 'Seleb', 'Cantik', 'Tekno', 'Otomotif', 'Travel', 'Blog', 'Difabel', 'Ramadan', 'Kolom', 'Fokus', 'Creative Lab', 'Event', 'Data', 'Cek Fakta', 'Newsletter', 'Inforial'])
|
486 |
query = st.text_input("Masukkan Query :").replace(' ', '+')
|
487 |
|
488 |
jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
|
489 |
date = date.today()
|
490 |
download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
|
491 |
+
param_kosong = []
|
492 |
with st.expander("Preference Settings :"):
|
493 |
selected_options = st.multiselect(
|
494 |
'Pilih tampilan:',
|
|
|
517 |
else:
|
518 |
# CNBC Indonesia
|
519 |
if selected_site == "CNBC Indonesia":
|
520 |
+
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words, param_kosong)
|
521 |
|
522 |
# Detik.com
|
523 |
elif selected_site == "Detik.com":
|
524 |
+
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words, param_kosong)
|
525 |
|
526 |
# Viva.co.id
|
527 |
elif selected_site == "Viva.co.id":
|
528 |
st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
|
529 |
+
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words, param_kosong)
|
530 |
|
531 |
# Tempo.co
|
532 |
elif selected_site == "Tempo.co":
|
533 |
st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
|
534 |
+
hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words, selected_channel)
|
535 |
|
536 |
# Liputan6.com
|
537 |
elif selected_site == "Liputan6.com":
|