naufalnashif commited on
Commit
2f530ac
1 Parent(s): 209be85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py CHANGED
@@ -156,6 +156,71 @@ def scrape_detik_news(query, date, jumlah):
156
  time.sleep(1)
157
  my_bar.empty()
158
  return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  #---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
160
 
161
  def clean_text(text):
 
156
  time.sleep(1)
157
  my_bar.empty()
158
  return data
159
+
160
@st.cache_data
def scrape_viva_data(query, date, jumlah):
    """Scrape article metadata from viva.co.id search results.

    Collects up to ``jumlah`` articles matching ``query`` and reports
    progress through a Streamlit progress bar.

    Parameters
    ----------
    query : str
        Search keyword inserted into the site's search URL.
    date : str
        Currently unused; kept for signature parity with the other
        scrapers in this file (callers pass it positionally).
    jumlah : int
        Maximum number of articles to collect.

    Returns
    -------
    list[dict]
        One dict per article with keys 'category', 'date',
        'judul-berita' (title) and 'link-berita' (URL).
    """
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)

    # Loop-invariant: build the User-Agent pool once, not on every request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    while len(data) < jumlah:
        try:
            # NOTE(review): `page` is incremented below but never sent to the
            # server, so every iteration requests the same result page and can
            # append duplicate articles. Confirm viva.co.id's pagination query
            # parameter and include it here (e.g. "&page={page}").
            url = f"https://www.viva.co.id/search?q={query}"

            # Rotate the User-Agent so repeated requests look less uniform.
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5",
            }
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('div', class_='article-list-container')
            if not articles:
                # No result containers on the page: nothing more to collect.
                break

            for article in articles:
                title = article.find('h2').text.strip()
                link = article.find('a')['href']
                category = article.find('h3').text.strip()
                # Distinct name so the `date` parameter is not shadowed
                # (the original overwrote the argument here).
                article_date = article.find('div', class_='article-list-date content_center').text.strip()

                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title,
                    'link-berita': link,
                })
                # >= stops exactly at the limit instead of appending one
                # extra record and truncating afterwards; output is identical.
                if len(data) >= jumlah:
                    data = data[:jumlah]
                    break

            # Cap the fraction at 1.0 so st.progress never receives > 1.
            prop = min(len(data) / jumlah, 1)
            my_bar.progress(prop, text=progress_text)
            page += 1
        except requests.exceptions.RequestException as e:
            # Network/HTTP failure: surface it in the UI and stop scraping,
            # returning whatever was collected so far.
            st.error(f"An error occurred: {e}")
            break

    # Brief pause so the completed bar is visible before it is cleared.
    time.sleep(1)
    my_bar.empty()

    return data
224
  #---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
225
 
226
  def clean_text(text):