Spaces:
Sleeping
Sleeping
naufalnashif
committed on
Commit
•
2f530ac
1
Parent(s):
209be85
Update app.py
Browse files
app.py
CHANGED
@@ -156,6 +156,71 @@ def scrape_detik_news(query, date, jumlah):
|
|
156 |
time.sleep(1)
|
157 |
my_bar.empty()
|
158 |
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
#---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
|
160 |
|
161 |
def clean_text(text):
|
|
|
156 |
time.sleep(1)
|
157 |
my_bar.empty()
|
158 |
return data
|
159 |
+
|
160 |
+
@st.cache_data
def scrape_viva_data(query, date, jumlah):
    """Scrape article metadata from viva.co.id search results.

    Args:
        query: Search keyword inserted into the viva.co.id search URL.
        date: Unused by this scraper (kept for signature parity with the
            sibling scrapers); each row carries its own scraped date instead.
        jumlah: Maximum number of articles to collect.

    Returns:
        list[dict]: Rows with keys 'category', 'date', 'judul-berita'
        (title) and 'link-berita' (URL); at most `jumlah` items.
    """
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)

    while len(data) < jumlah:
        try:
            # NOTE(review): `page` is incremented below but never used in the
            # URL, so every iteration re-fetches the same first results page
            # and collects duplicates. Confirm viva.co.id's pagination query
            # parameter before wiring it in.
            url = f"https://www.viva.co.id/search?q={query}"

            # Rotate User-Agent strings to reduce the chance of being blocked.
            user_agents = [
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
            ]
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5",
            }
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('div', class_='article-list-container')
            if not articles:
                # No result cards on the page: nothing more to collect.
                break

            for article in articles:
                title_tag = article.find('h2')
                link_tag = article.find('a')
                category_tag = article.find('h3')
                date_tag = article.find('div', class_='article-list-date content_center')
                # Skip malformed cards instead of crashing: the original code
                # dereferenced `.text` / `['href']` on possibly-None results,
                # and AttributeError is not caught by the except clause below.
                if not (title_tag and link_tag and category_tag and date_tag):
                    continue

                data.append({
                    'category': category_tag.text.strip(),
                    # Local name avoids clobbering the `date` parameter,
                    # which the original loop silently overwrote.
                    'date': date_tag.text.strip(),
                    'judul-berita': title_tag.text.strip(),
                    'link-berita': link_tag['href'],
                })
                if len(data) >= jumlah:
                    data = data[:jumlah]
                    break

            # Progress is the fraction of the requested count, capped at 1.
            my_bar.progress(min(len(data) / jumlah, 1), text=progress_text)
            page += 1
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    time.sleep(1)
    my_bar.empty()

    return data
|
224 |
#---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------
|
225 |
|
226 |
def clean_text(text):
|