naufalnashif commited on
Commit
2f530ac
1 Parent(s): 209be85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py CHANGED
@@ -156,6 +156,71 @@ def scrape_detik_news(query, date, jumlah):
156
  time.sleep(1)
157
  my_bar.empty()
158
  return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  #---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
160
 
161
  def clean_text(text):
 
156
  time.sleep(1)
157
  my_bar.empty()
158
  return data
159
+
160
@st.cache_data
def scrape_viva_data(query, date, jumlah):
    """Scrape article metadata from viva.co.id search results.

    Collects up to ``jumlah`` articles matching ``query`` and reports
    progress through a Streamlit progress bar.

    Parameters
    ----------
    query : str
        Search keyword inserted into the site's search URL.
    date : str
        Currently unused; kept for signature parity with the other
        scrapers in this file (callers pass it positionally).
    jumlah : int
        Maximum number of articles to collect.

    Returns
    -------
    list[dict]
        One dict per article with keys 'category', 'date',
        'judul-berita' (title) and 'link-berita' (URL).
    """
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)

    # Loop-invariant: build the User-Agent pool once, not on every request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    while len(data) < jumlah:
        try:
            # NOTE(review): `page` is incremented below but never sent to the
            # server, so every iteration requests the same result page and can
            # append duplicate articles. Confirm viva.co.id's pagination query
            # parameter and include it here (e.g. "&page={page}").
            url = f"https://www.viva.co.id/search?q={query}"

            # Rotate the User-Agent so repeated requests look less uniform.
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5",
            }
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('div', class_='article-list-container')
            if not articles:
                # No result containers on the page: nothing more to collect.
                break

            for article in articles:
                title = article.find('h2').text.strip()
                link = article.find('a')['href']
                category = article.find('h3').text.strip()
                # Distinct name so the `date` parameter is not shadowed
                # (the original overwrote the argument here).
                article_date = article.find('div', class_='article-list-date content_center').text.strip()

                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title,
                    'link-berita': link,
                })
                # >= stops exactly at the limit instead of appending one
                # extra record and truncating afterwards; output is identical.
                if len(data) >= jumlah:
                    data = data[:jumlah]
                    break

            # Cap the fraction at 1.0 so st.progress never receives > 1.
            prop = min(len(data) / jumlah, 1)
            my_bar.progress(prop, text=progress_text)
            page += 1
        except requests.exceptions.RequestException as e:
            # Network/HTTP failure: surface it in the UI and stop scraping,
            # returning whatever was collected so far.
            st.error(f"An error occurred: {e}")
            break

    # Brief pause so the completed bar is visible before it is cleared.
    time.sleep(1)
    my_bar.empty()

    return data
224
  #---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
225
 
226
  def clean_text(text):