Spaces:

naufalnashif
/

scraping-news-headline

Sleeping

App Files Community

naufalnashif committed on Nov 15, 2023

Commit

abb9b5c

•

1 Parent(s): 2385ba5

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -14

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from nltk.corpus import stopwords
 #---------------------------------------------------Scraping Function----------------------------------------------------------------------
 @st.cache_data
-def scrape_cnbc_data(query, date, jumlah):
     data = []
     page = 1
     progress_text = "Scraping in progress. Please wait."
@@ -88,7 +88,7 @@ def scrape_cnbc_data(query, date, jumlah):
 @st.cache_data
-def scrape_detik_news(query, date, jumlah):
     start_page = 1
     base_url = "https://www.detik.com/search/searchall"
     data = []
@@ -159,7 +159,7 @@ def scrape_detik_news(query, date, jumlah):
     return data
 @st.cache_data
-def scrape_viva_data(query, date, jumlah):
     data = []
     page = 1
     progress_text = "Scraping in progress. Please wait."
@@ -227,20 +227,51 @@ def scrape_viva_data(query, date, jumlah):
     return data
 @st.cache_data
-def scrape_tempo_data(query, date, jumlah):
     data = []
     domain = 1
     max_domains = 5
     progress_text = "Scraping in progress. Please wait."
     my_bar = st.progress(len(data), text=progress_text)
     # List of channel values
-    channels = ["","20", "19", "5", "1", "21", "33", "9", "32", "2", "34", "23", "35", "43", "44", "30", "14", "8", "47", "62", "65", "66", "63", "12"]
     seen_titles = set()  # Set untuk melacak judul berita yang sudah muncul
     try:
         while len(data) < jumlah and domain <= max_domains:
-            for kanal in channels:
-                url = f"https://www.tempo.co/search?waktu={waktu}&kanal={kanal}&subkanal=&domain={domain}&q={query}"
                 user_agents = [
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
@@ -271,6 +302,7 @@ def scrape_tempo_data(query, date, jumlah):
                         date = date_before.replace(category, '')
                         data.append({
                             'category': category,
                             'date': date,
                             'judul-berita': title,
                             'link-berita': link,
@@ -431,8 +463,8 @@ def eksplorasi_data(selected_options, results, colormap, words):
                 st.pyplot(fig)
 @st.cache_data
-def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words):
-    data_df = _scrape_function(query, date, jumlah)
     hidden_data = data_df
     scraping_done = True
     results = preprocessing_data(hidden_data)
@@ -450,12 +482,13 @@ with st.expander("Scraping Settings :"):
     selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
     if selected_site == "Tempo.co":
         waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
     query = st.text_input("Masukkan Query :").replace(' ', '+')
     jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
     date = date.today()
     download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
 with st.expander("Preference Settings :"):
     selected_options = st.multiselect(
         'Pilih tampilan:',
@@ -484,21 +517,21 @@ if st.button("Mulai Scraping"):
     else:
         # CNBC Indonesia
         if selected_site == "CNBC Indonesia":
-            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words)
         # Detik.com
         elif selected_site == "Detik.com":
-            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words)
         # Viva.co.id
         elif selected_site == "Viva.co.id":
             st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
-            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words)
         # Tempo.co
         elif selected_site == "Tempo.co":
             st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
-            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words)
         # Liputan6.com
         elif selected_site == "Liputan6.com":

 #---------------------------------------------------Scraping Function----------------------------------------------------------------------
 @st.cache_data
+def scrape_cnbc_data(query, date, jumlah, param_kosong):
     data = []
     page = 1
     progress_text = "Scraping in progress. Please wait."
 @st.cache_data
+def scrape_detik_news(query, date, jumlah, param_kosong):
     start_page = 1
     base_url = "https://www.detik.com/search/searchall"
     data = []
     return data
 @st.cache_data
+def scrape_viva_data(query, date, jumlah, param_kosong):
     data = []
     page = 1
     progress_text = "Scraping in progress. Please wait."
     return data
 @st.cache_data
+def scrape_tempo_data(query, date, jumlah, selected_channel):
     data = []
     domain = 1
     max_domains = 5
     progress_text = "Scraping in progress. Please wait."
     my_bar = st.progress(len(data), text=progress_text)
     # List of channel values
+    default_channels = {
+        'All': '',
+        'Nasional': '20',
+        'Metro': '19',
+        'Dunia': '5',
+        'Bisnis': '1',
+        'Bola': '21',
+        'Sport': '33',
+        'Gaya': '9',
+        'Seleb': '32',
+        'Cantik': '2',
+        'Tekno': '34',
+        'Otomotif': '23',
+        'Travel': '35',
+        'Blog': '43',
+        'Difabel': '44',
+        'Ramadan': '30',
+        'Kolom': '14',
+        'Fokus': '8',
+        'Creative Lab': '47',
+        'Event': '62',
+        'Data': '65',
+        'Cek Fakta': '66',
+        'Newsletter': '63',
+        'Inforial': '12'
+    }
+    # Ubah channels sesuai dengan selected_channel
+    if selected_channel != 'Defaults' and selected_channel in default_channels:
+        channels = {selected_channel: default_channels[selected_channel]}
+    else:
+        channels = default_channels
     seen_titles = set()  # Set untuk melacak judul berita yang sudah muncul
     try:
         while len(data) < jumlah and domain <= max_domains:
+            for kanal, value in channels.items():
+                url = f"https://www.tempo.co/search?waktu={waktu}&kanal={value}&subkanal=&domain={domain}&q={query}"
                 user_agents = [
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
                         date = date_before.replace(category, '')
                         data.append({
                             'category': category,
+                            'kanal' : kanal,
                             'date': date,
                             'judul-berita': title,
                             'link-berita': link,
                 st.pyplot(fig)
 @st.cache_data
+def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words, param):
+    data_df = _scrape_function(query, date, jumlah, param)
     hidden_data = data_df
     scraping_done = True
     results = preprocessing_data(hidden_data)
     selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
     if selected_site == "Tempo.co":
         waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
+        selected_channel = st.selectbox("Pilih Kanal :", ['Defaults','All', 'Nasional', 'Metro', 'Dunia', 'Bisnis', 'Bola', 'Sport', 'Gaya', 'Seleb', 'Cantik', 'Tekno', 'Otomotif', 'Travel', 'Blog', 'Difabel', 'Ramadan', 'Kolom', 'Fokus', 'Creative Lab', 'Event', 'Data', 'Cek Fakta', 'Newsletter', 'Inforial'])
     query = st.text_input("Masukkan Query :").replace(' ', '+')
     jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
     date = date.today()
     download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
+param_kosong = []
 with st.expander("Preference Settings :"):
     selected_options = st.multiselect(
         'Pilih tampilan:',
     else:
         # CNBC Indonesia
         if selected_site == "CNBC Indonesia":
+            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words, param_kosong)
         # Detik.com
         elif selected_site == "Detik.com":
+            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words, param_kosong)
         # Viva.co.id
         elif selected_site == "Viva.co.id":
             st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
+            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words, param_kosong)
         # Tempo.co
         elif selected_site == "Tempo.co":
             st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
+            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words, selected_channel)
         # Liputan6.com
         elif selected_site == "Liputan6.com":