naufalnashif committed
Commit a2f0a76
1 Parent(s): bb86740

Create app.py

Files changed (1): app.py (+398, -0)
app.py ADDED
@@ -0,0 +1,398 @@
#---------------------------------------------------Requirements----------------------------------------------------------------------
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import requests
from bs4 import BeautifulSoup
from datetime import date
import time
from collections import Counter
import nltk
from nltk.corpus import stopwords

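# Assumed dependencies (not pinned in this commit): the imports above expect packages such as
#   streamlit, pandas, numpy, matplotlib, seaborn, wordcloud, requests, beautifulsoup4, nltk, openpyxl
# openpyxl is only needed because DataFrame.to_excel() is used for the XLSX download below.
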
#---------------------------------------------------Scraping Function----------------------------------------------------------------------

@st.cache_data
def scrape_cnbc_data(query, date, jumlah):
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)

    for _ in range(jumlah):

        # Stop once enough articles have been collected
        if len(data) >= jumlah:
            data = data[:jumlah]
            break

        prop = min(len(data) / jumlah, 1)
        my_bar.progress(prop, text=progress_text)
        base_url = f"https://www.cnbcindonesia.com/search?query={query}&p={page}&kanal=&tipe=artikel&date={date}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }

        response = requests.get(base_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        articles = soup.find_all('article')

        if not articles:
            break

        for article in articles:
            title = article.find('h2').text.strip()
            link = article.find('a')['href']
            # Use a separate name so the 'date' search parameter used in base_url is not overwritten
            article_date = article.find('span', class_='date').text.strip()

            data.append({
                'date': article_date,
                'judul-berita': title,
                'link-berita': link,
            })

        page += 1

        time.sleep(1)
    my_bar.empty()

    return data

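# Illustrative call (hypothetical values; mirrors how the function is invoked in the UI section below):
#   scrape_cnbc_data("ihsg", date.today().strftime("%Y/%m/%d"), 50)
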
@st.cache_data
def scrape_detik_news(query, jumlah):
    site_id = 2
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)

    for _ in range(jumlah):

        # Stop once enough articles have been collected
        if len(data) >= jumlah:
            data = data[:jumlah]
            break

        prop = min(len(data) / jumlah, 1)
        my_bar.progress(prop, text=progress_text)
        base_url = "https://www.detik.com/search/searchall"
        params = {
            "query": query,
            "siteid": site_id,
            "page": page
        }

        response = requests.get(base_url, params=params)
        soup = BeautifulSoup(response.content, "html.parser")

        articles = soup.find_all("article")

        if not articles:
            break

        for article in articles:
            article_date = article.find("span", class_="date").text.strip()
            title = article.find("h2", class_="title").text.strip()
            link = article.find("a")["href"]
            data.append({"date": article_date, "judul-berita": title, "link-berita": link})

        page += 1

        time.sleep(1)
    my_bar.empty()
    return data

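# Illustrative call (hypothetical values; mirrors the invocation in the Detik.com branch below):
#   scrape_detik_news(query="ihsg", jumlah=50)
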
#---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------

def clean_text(text):
    # Step 1: remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Step 2: remove URLs
    text = re.sub(r'http[s]?://.[a-zA-Z0-9./_?=%&#+!]+', '', text)
    text = re.sub(r'pic.twitter.com?.[a-zA-Z0-9./_?=%&#+!]+', '', text)

    # Step 3: remove mentions
    text = re.sub(r'@[\w]+', '', text)

    # Step 4: remove hashtags
    text = re.sub(r'#([\w]+)', '', text)

    # Step 5: remove the HTML entities left over from '&' and '>'
    text = re.sub(r'&amp;|&gt;', '', text)

    # Step 6: remove special characters (symbols)
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)

    # Step 7: remove digits
    text = re.sub(r'[0-9]+', '', text)

    # Step 8: collapse multiple spaces into a single space
    text = re.sub(' +', ' ', text)

    # Step 9: strip leading and trailing whitespace
    text = text.strip()

    # Step 10: convert to lowercase
    text = text.lower()

    # Step 11: collapse a character repeated three or more times in a row (e.g. "yukkk")
    # text = re.sub(r'([a-zA-Z])\1\1', '\\1', text)
    # text = re.sub(r'(.)(\1{2,})', r'\1\1', text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    return text

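# Worked example (illustrative input, not from the original file):
#   clean_text("Yukkk cek https://contoh.com @user #promo 123!!!")
#   -> "yuk cek"   (URL, mention, hashtag, symbols and digits removed; lowercased; "kkk" collapsed)
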
#---------------------------------------------------Normalization----------------------------------------------------------------------

# Load Salsabila's colloquial Indonesian (slang) lexicon
kamus_path = '_json_colloquial-indonesian-lexicon.txt'  # replace with the correct path if needed
with open(kamus_path) as f:
    data = f.read()
lookp_dict = json.loads(data)

# My own slang entries that are missing from Salsabila's lexicon
kamus_sendiri_path = 'kamus_gaul_custom.txt'
with open(kamus_sendiri_path) as f:
    kamus_sendiri = f.read()
kamus_gaul_baru = json.loads(kamus_sendiri)

# Merge the new slang entries into the existing lexicon
lookp_dict.update(kamus_gaul_baru)

# Slang-normalization function
def normalize_slang(text, slang_dict):
    words = text.split()
    normalized_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(normalized_words)

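# Illustrative use (the mapping shown is hypothetical; actual entries depend on the two lexicon files):
#   normalize_slang("gak tau", lookp_dict)  ->  "tidak tahu"   if the lexicon maps "gak"->"tidak", "tau"->"tahu"
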
#---------------------------------------------------NLTK Remove Stopwords----------------------------------------------------------------------

# Initialize the Indonesian stopword set
nltk.download("stopwords")
stop_words = set(stopwords.words("indonesian"))

def remove_stopwords(text, stop_words):
    # Split the text into words
    words = text.split()

    # Drop Indonesian stopwords
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

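# Illustrative use (assuming "yang" is in NLTK's Indonesian stopword list):
#   remove_stopwords("harga saham yang naik", stop_words)  ->  "harga saham naik"
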
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Aplikasi Web Scraping CNBC / Detik.com & Explorasi Data")

# Website selector
selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com"])

query = st.text_input("Masukkan Query :")
jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")

date = date.today()
download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Hidden variable that holds the scraping results
# (module-level variables are reinitialized on every Streamlit rerun, hence the st.info hint above)
hidden_data = []

scraping_done = False  # flag set to True once scraping has finished

#---------------------------------------------------CNBC Indonesia----------------------------------------------------------------------

if selected_site == "CNBC Indonesia":
    if st.button("Mulai Scraping"):
        if not query:
            st.error("Mohon isi query.")
        else:
            data_df = scrape_cnbc_data(query, date.strftime("%Y/%m/%d"), jumlah)
            hidden_data = data_df  # store the results in the hidden variable
            scraping_done = True   # mark scraping as finished

            #---------------------------------------------------Data Exploration--------------------------------------------------

            df = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"])
            texts = df["judul-berita"]

            # Initialize results
            results = []

            # Process the text data
            for text in texts:
                cleaned_text = clean_text(text)
                norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
                tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

                results.append((text, cleaned_text, norm_slang_text, tanpa_stopwords))

            # Split the view into two columns
            columns = st.columns(2)

            # Initialize all_texts outside the `with columns[0]` block
            all_texts = ""

            # First column: Word Cloud
            with columns[0]:
                if results:
                    all_texts = [result[3] for result in results if result[3] is not None and not pd.isna(result[3])]
                    all_texts = " ".join(all_texts)

                    st.subheader("Word Cloud")

                    if all_texts:
                        wordcloud = WordCloud(width=800, height=660, background_color='white',
                                              colormap='Purples',
                                              contour_color='black',
                                              contour_width=2,
                                              mask=None).generate(all_texts)
                        st.image(wordcloud.to_array())
                    else:
                        st.write("Tidak ada data untuk ditampilkan.")

            # Second column: Most Common Words
            with columns[1]:
                st.subheader("Most Common Words")

                if all_texts:
                    word_counts = Counter(all_texts.split())
                    most_common_words = word_counts.most_common(5)

                    words, counts = zip(*most_common_words)

                    fig, ax = plt.subplots(figsize=(10, 6))
                    ax.bar(words, counts)
                    ax.set_xlabel("Kata-kata")
                    ax.set_ylabel("Jumlah")
                    ax.set_title("Kata-kata Paling Umum")
                    ax.tick_params(axis='x', rotation=45)

                    st.pyplot(fig)
                else:
                    st.write("Tidak ada data untuk ditampilkan.")

            if not hidden_data:
                st.warning(f"Tidak ada data pada query '{query}'", icon="⚠️")

#---------------------------------------------------Detik.com----------------------------------------------------------------------

elif selected_site == "Detik.com":
    if st.button("Mulai Scraping"):
        if not query:
            st.error("Mohon isi query.")
        else:
            data_df = scrape_detik_news(query=query, jumlah=jumlah)
            hidden_data = data_df  # store the results in the hidden variable
            scraping_done = True   # mark scraping as finished

            #---------------------------------------------------Data Exploration--------------------------------------------------

            df = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"])
            texts = df["judul-berita"]

            # Initialize results
            results = []

            # Process the text data
            for text in texts:
                cleaned_text = clean_text(text)
                norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
                tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

                results.append((text, cleaned_text, norm_slang_text, tanpa_stopwords))

            # Split the view into two columns
            columns = st.columns(2)

            # Initialize all_texts outside the `with columns[0]` block
            all_texts = ""

            # First column: Word Cloud
            with columns[0]:
                if results:
                    all_texts = [result[3] for result in results if result[3] is not None and not pd.isna(result[3])]
                    all_texts = " ".join(all_texts)

                    st.subheader("Word Cloud")

                    if all_texts:
                        wordcloud = WordCloud(width=800, height=660, background_color='white',
                                              colormap='Purples',
                                              contour_color='black',
                                              contour_width=2,
                                              mask=None).generate(all_texts)
                        st.image(wordcloud.to_array())
                    else:
                        st.write("Tidak ada data untuk ditampilkan.")

            # Second column: Most Common Words
            with columns[1]:
                st.subheader("Most Common Words")

                if all_texts:
                    word_counts = Counter(all_texts.split())
                    most_common_words = word_counts.most_common(5)

                    words, counts = zip(*most_common_words)

                    fig, ax = plt.subplots(figsize=(10, 6))
                    ax.bar(words, counts)
                    ax.set_xlabel("Kata-kata")
                    ax.set_ylabel("Jumlah")
                    ax.set_title("Kata-kata Paling Umum")
                    ax.tick_params(axis='x', rotation=45)

                    st.pyplot(fig)
                else:
                    st.write("Tidak ada data untuk ditampilkan.")

            if not hidden_data:
                st.warning(f"Tidak ada data pada query '{query}'", icon="⚠️")

#---------------------------------------------------Download File & Scraping Results----------------------------------------------------------------------

# Show the scraping results
if scraping_done:
    if hidden_data:
        if download_format == "XLSX":
            st.subheader("Hasil Scraping")
            st.write(pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]))
            df = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"])
            df.to_excel("hasil_scraping.xlsx", index=False)
            st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=open("hasil_scraping.xlsx", "rb").read(), key="xlsx_download", file_name="hasil_scraping.xlsx")
        elif download_format == "CSV":
            st.subheader("Hasil Scraping")
            st.write(pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]))
            df = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"])
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name="hasil_scraping.csv")
        elif download_format == "JSON":
            st.subheader("Hasil Scraping")
            st.write(pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]))
            json_data = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]).to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name="hasil_scraping.json")
        elif download_format == "TXT":
            st.subheader("Hasil Scraping")
            st.write(pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]))
            text_data = "\n".join([f"{row['date']} - {row['judul-berita']} - {row['link-berita']}" for row in hidden_data])
            st.download_button(label=f"Unduh TXT ({len(hidden_data)} data)", data=text_data, key="txt_download", file_name="hasil_scraping.txt")
if not scraping_done:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')