naufalnashif committed on
Commit
abb9b5c
1 Parent(s): 2385ba5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -14
app.py CHANGED
@@ -20,7 +20,7 @@ from nltk.corpus import stopwords
20
  #---------------------------------------------------Scraping Function----------------------------------------------------------------------
21
 
22
  @st.cache_data
23
- def scrape_cnbc_data(query, date, jumlah):
24
  data = []
25
  page = 1
26
  progress_text = "Scraping in progress. Please wait."
@@ -88,7 +88,7 @@ def scrape_cnbc_data(query, date, jumlah):
88
 
89
 
90
  @st.cache_data
91
- def scrape_detik_news(query, date, jumlah):
92
  start_page = 1
93
  base_url = "https://www.detik.com/search/searchall"
94
  data = []
@@ -159,7 +159,7 @@ def scrape_detik_news(query, date, jumlah):
159
  return data
160
 
161
  @st.cache_data
162
- def scrape_viva_data(query, date, jumlah):
163
  data = []
164
  page = 1
165
  progress_text = "Scraping in progress. Please wait."
@@ -227,20 +227,51 @@ def scrape_viva_data(query, date, jumlah):
227
  return data
228
 
229
  @st.cache_data
230
- def scrape_tempo_data(query, date, jumlah):
231
  data = []
232
  domain = 1
233
  max_domains = 5
234
  progress_text = "Scraping in progress. Please wait."
235
  my_bar = st.progress(len(data), text=progress_text)
236
  # List of channel values
237
- channels = ["","20", "19", "5", "1", "21", "33", "9", "32", "2", "34", "23", "35", "43", "44", "30", "14", "8", "47", "62", "65", "66", "63", "12"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  seen_titles = set() # Set untuk melacak judul berita yang sudah muncul
239
 
240
  try:
241
  while len(data) < jumlah and domain <= max_domains:
242
- for kanal in channels:
243
- url = f"https://www.tempo.co/search?waktu={waktu}&kanal={kanal}&subkanal=&domain={domain}&q={query}"
244
  user_agents = [
245
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
246
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
@@ -271,6 +302,7 @@ def scrape_tempo_data(query, date, jumlah):
271
  date = date_before.replace(category, '')
272
  data.append({
273
  'category': category,
 
274
  'date': date,
275
  'judul-berita': title,
276
  'link-berita': link,
@@ -431,8 +463,8 @@ def eksplorasi_data(selected_options, results, colormap, words):
431
 
432
  st.pyplot(fig)
433
  @st.cache_data
434
- def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words):
435
- data_df = _scrape_function(query, date, jumlah)
436
  hidden_data = data_df
437
  scraping_done = True
438
  results = preprocessing_data(hidden_data)
@@ -450,12 +482,13 @@ with st.expander("Scraping Settings :"):
450
  selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
451
  if selected_site == "Tempo.co":
452
  waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
 
453
  query = st.text_input("Masukkan Query :").replace(' ', '+')
454
 
455
  jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
456
  date = date.today()
457
  download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
458
-
459
  with st.expander("Preference Settings :"):
460
  selected_options = st.multiselect(
461
  'Pilih tampilan:',
@@ -484,21 +517,21 @@ if st.button("Mulai Scraping"):
484
  else:
485
  # CNBC Indonesia
486
  if selected_site == "CNBC Indonesia":
487
- hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words)
488
 
489
  # Detik.com
490
  elif selected_site == "Detik.com":
491
- hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words)
492
 
493
  # Viva.co.id
494
  elif selected_site == "Viva.co.id":
495
  st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
496
- hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words)
497
 
498
  # Tempo.co
499
  elif selected_site == "Tempo.co":
500
  st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
501
- hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words)
502
 
503
  # Liputan6.com
504
  elif selected_site == "Liputan6.com":
 
20
  #---------------------------------------------------Scraping Function----------------------------------------------------------------------
21
 
22
  @st.cache_data
23
+ def scrape_cnbc_data(query, date, jumlah, param_kosong):
24
  data = []
25
  page = 1
26
  progress_text = "Scraping in progress. Please wait."
 
88
 
89
 
90
  @st.cache_data
91
+ def scrape_detik_news(query, date, jumlah, param_kosong):
92
  start_page = 1
93
  base_url = "https://www.detik.com/search/searchall"
94
  data = []
 
159
  return data
160
 
161
  @st.cache_data
162
+ def scrape_viva_data(query, date, jumlah, param_kosong):
163
  data = []
164
  page = 1
165
  progress_text = "Scraping in progress. Please wait."
 
227
  return data
228
 
229
  @st.cache_data
230
+ def scrape_tempo_data(query, date, jumlah, selected_channel):
231
  data = []
232
  domain = 1
233
  max_domains = 5
234
  progress_text = "Scraping in progress. Please wait."
235
  my_bar = st.progress(len(data), text=progress_text)
236
  # List of channel values
237
+ default_channels = {
238
+ 'All': '',
239
+ 'Nasional': '20',
240
+ 'Metro': '19',
241
+ 'Dunia': '5',
242
+ 'Bisnis': '1',
243
+ 'Bola': '21',
244
+ 'Sport': '33',
245
+ 'Gaya': '9',
246
+ 'Seleb': '32',
247
+ 'Cantik': '2',
248
+ 'Tekno': '34',
249
+ 'Otomotif': '23',
250
+ 'Travel': '35',
251
+ 'Blog': '43',
252
+ 'Difabel': '44',
253
+ 'Ramadan': '30',
254
+ 'Kolom': '14',
255
+ 'Fokus': '8',
256
+ 'Creative Lab': '47',
257
+ 'Event': '62',
258
+ 'Data': '65',
259
+ 'Cek Fakta': '66',
260
+ 'Newsletter': '63',
261
+ 'Inforial': '12'
262
+ }
263
+
264
+ # Ubah channels sesuai dengan selected_channel
265
+ if selected_channel != 'Defaults' and selected_channel in default_channels:
266
+ channels = {selected_channel: default_channels[selected_channel]}
267
+ else:
268
+ channels = default_channels
269
  seen_titles = set() # Set untuk melacak judul berita yang sudah muncul
270
 
271
  try:
272
  while len(data) < jumlah and domain <= max_domains:
273
+ for kanal, value in channels.items():
274
+ url = f"https://www.tempo.co/search?waktu={waktu}&kanal={value}&subkanal=&domain={domain}&q={query}"
275
  user_agents = [
276
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
277
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
 
302
  date = date_before.replace(category, '')
303
  data.append({
304
  'category': category,
305
+ 'kanal' : kanal,
306
  'date': date,
307
  'judul-berita': title,
308
  'link-berita': link,
 
463
 
464
  st.pyplot(fig)
465
  @st.cache_data
466
+ def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words, param):
467
+ data_df = _scrape_function(query, date, jumlah, param)
468
  hidden_data = data_df
469
  scraping_done = True
470
  results = preprocessing_data(hidden_data)
 
482
  selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
483
  if selected_site == "Tempo.co":
484
  waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
485
+ selected_channel = st.selectbox("Pilih Kanal :", ['Defaults','All', 'Nasional', 'Metro', 'Dunia', 'Bisnis', 'Bola', 'Sport', 'Gaya', 'Seleb', 'Cantik', 'Tekno', 'Otomotif', 'Travel', 'Blog', 'Difabel', 'Ramadan', 'Kolom', 'Fokus', 'Creative Lab', 'Event', 'Data', 'Cek Fakta', 'Newsletter', 'Inforial'])
486
  query = st.text_input("Masukkan Query :").replace(' ', '+')
487
 
488
  jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
489
  date = date.today()
490
  download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
491
+ param_kosong = []
492
  with st.expander("Preference Settings :"):
493
  selected_options = st.multiselect(
494
  'Pilih tampilan:',
 
517
  else:
518
  # CNBC Indonesia
519
  if selected_site == "CNBC Indonesia":
520
+ hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words, param_kosong)
521
 
522
  # Detik.com
523
  elif selected_site == "Detik.com":
524
+ hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words, param_kosong)
525
 
526
  # Viva.co.id
527
  elif selected_site == "Viva.co.id":
528
  st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
529
+ hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words, param_kosong)
530
 
531
  # Tempo.co
532
  elif selected_site == "Tempo.co":
533
  st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
534
+ hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words, selected_channel)
535
 
536
  # Liputan6.com
537
  elif selected_site == "Liputan6.com":