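"""Streamlit app for scraping Indonesian news headlines (CNBC Indonesia,
Detik.com, Viva.co.id, Tempo.co), preprocessing them, and exploring the
results with a word cloud and a word-frequency chart."""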
import json
import random
import re
import time
from collections import Counter
from datetime import date

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import streamlit as st
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Rotating the User-Agent per request makes the scrapers look less like a
# bot. The same list was previously duplicated inside every scraper.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
]


def build_headers():
    """Return request headers with a randomly chosen User-Agent."""
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.5",
    }


@st.cache_data
def scrape_cnbc_data(query, date, jumlah, param_kosong):
    """Scrape up to `jumlah` headlines from CNBC Indonesia search results."""
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    while len(data) < jumlah:
        try:
            url = f"https://www.cnbcindonesia.com/search?query={query}&p={page}&kanal=&tipe=artikel&date={date}"
            response = requests.get(url, headers=build_headers(), timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('article')
            if not articles:
                break

            for article in articles:
                title = article.find('h2').text.strip()
                link = article.find('a')['href']
                category = article.find('span', class_='label').text.strip()
                date_category = article.find('span', class_='date').text.strip()
                # Keep the article date in its own variable; reassigning
                # `date` here would corrupt the search URL on the next page.
                text_parts = date_category.split(' - ')
                article_date = text_parts[1].strip() if len(text_parts) > 1 else date_category

                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title,
                    'link-berita': link,
                })
                if len(data) >= jumlah:
                    break

            my_bar.progress(min(len(data) / jumlah, 1), text=progress_text)
            page += 1
            time.sleep(1)  # polite delay between page requests
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    my_bar.empty()
    return data
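

# Each scraper returns a list of dicts shaped like this (values are
# illustrative examples, not real output):
# {'category': 'Market', 'date': '20 November 2023',
#  'judul-berita': '...', 'link-berita': 'https://...'}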


@st.cache_data
def scrape_detik_news(query, date, jumlah, param_kosong):
    """Scrape up to `jumlah` headlines from detik.com search results."""
    data = []
    start_page = 1
    base_url = "https://www.detik.com/search/searchall"
    progress_text = "Scraping in progress... Please wait..."
    my_bar = st.progress(0, text=progress_text)

    while len(data) < jumlah:
        try:
            # Let requests build the query string from `params` instead of
            # formatting the URL by hand (the dict was previously unused).
            params = {
                "query": query,
                "siteid": 2,
                "sortby": "time",
                "page": start_page,
            }
            response = requests.get(base_url, params=params, headers=build_headers(), timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')
            if not articles:
                break

            for article in articles:
                title = article.find('h2').text.strip()
                link = article.find('a')['href']
                category = article.find('span', class_='category').text
                date_category = article.find('span', class_='date').text
                # The date span embeds the category label; strip it out.
                article_date = date_category.replace(category, '').strip()
                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title,
                    'link-berita': link,
                })
                if len(data) >= jumlah:
                    break

            my_bar.progress(min(len(data) / jumlah, 1), text=progress_text)
            start_page += 1
            time.sleep(1)  # polite delay between page requests
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    my_bar.empty()
    return data


@st.cache_data
def scrape_viva_data(query, date, jumlah, param_kosong):
    """Scrape headlines from viva.co.id search results (still experimental).

    NOTE: the search URL takes no page parameter here, so repeated loop
    passes re-read the same result page; the UI below marks this site as
    under development.
    """
    data = []
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    while len(data) < jumlah:
        try:
            url = f"https://www.viva.co.id/search?q={query}"
            response = requests.get(url, headers=build_headers(), timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('div', class_='card-box ft240 margin-bottom-sm')
            if not articles:
                break

            for article in articles:
                title = article.find('h2', class_='title').text
                link = article.find('a')['href']
                category_element = article.find('span', class_="kanal cl-dark")
                category = category_element.text.strip() if category_element else None
                date_element = article.find('h4', class_="date")
                date_before = date_element.text.strip() if date_element else None
                # Guard against missing elements before stripping the
                # category label out of the date text.
                if date_before and category:
                    article_date = date_before.replace(category, '').strip()
                else:
                    article_date = date_before
                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title,
                    'link-berita': link,
                })
                if len(data) >= jumlah:
                    break

            my_bar.progress(min(len(data) / jumlah, 1), text=progress_text)
            time.sleep(1)  # polite delay between requests
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    my_bar.empty()
    return data


@st.cache_data
def scrape_tempo_data(query, date, jumlah, selected_channel):
    """Scrape headlines from tempo.co search results, optionally per channel.

    `date` receives the time-range value (e.g. '1tahun') chosen in the UI.
    """
    data = []
    domain = 1
    max_domains = 5
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    # Tempo channel names mapped to their numeric `kanal` query values.
    default_channels = {
        'All(Latest Only)': '', 'Nasional': '20', 'Metro': '19',
        'Dunia': '5', 'Bisnis': '1', 'Bola': '21', 'Sport': '33',
        'Gaya': '9', 'Seleb': '32', 'Cantik': '2', 'Tekno': '34',
        'Otomotif': '23', 'Travel': '35', 'Blog': '43', 'Difabel': '44',
        'Ramadan': '30', 'Kolom': '14', 'Fokus': '8', 'Creative Lab': '47',
        'Event': '62', 'Data': '65', 'Cek Fakta': '66', 'Newsletter': '63',
        'Inforial': '12',
    }

    if selected_channel != 'Defaults' and selected_channel in default_channels:
        channels = {selected_channel: default_channels[selected_channel]}
    else:
        channels = default_channels
    seen_titles = set()

    try:
        while len(data) < jumlah and domain <= max_domains:
            for kanal, value in channels.items():
                # Use the `date` argument rather than the global `waktu`,
                # so the cached function depends only on its inputs.
                url = f"https://www.tempo.co/search?waktu={date}&kanal={value}&subkanal=&domain={domain}&q={query}"
                response = requests.get(url, headers=build_headers(), timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')
                articles = soup.find_all('div', class_='card-box ft240 margin-bottom-sm')
                if not articles:
                    break
                for article in articles:
                    title = article.find('h2', class_='title').text
                    if title in seen_titles:
                        continue
                    link = article.find('a')['href']
                    category_element = article.find('span', class_="kanal cl-dark")
                    category = category_element.text.strip() if category_element else None
                    date_element = article.find('h4', class_="date")
                    date_before = date_element.text.strip() if date_element else None
                    # Guard against missing elements; also keep the article
                    # date out of the `date` argument used in the URL.
                    if date_before and category:
                        article_date = date_before.replace(category, '').strip()
                    else:
                        article_date = date_before
                    data.append({
                        'category': category,
                        'kanal': kanal,
                        'date': article_date,
                        'judul-berita': title,
                        'link-berita': link,
                    })
                    seen_titles.add(title)
                    if len(data) >= jumlah:
                        break
                if len(data) >= jumlah:
                    break
            my_bar.progress(min(len(data) / jumlah, 1), text=progress_text)
            domain += 1
            time.sleep(1)  # polite delay between passes
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred: {e}")

    my_bar.empty()
    return data


def clean_text(text):
    """Basic text cleaning for Indonesian news headlines."""
    if not isinstance(text, str):
        text = str(text)

    # Drop non-ASCII characters (emoji, typographic punctuation, etc.).
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove URLs and Twitter picture links.
    text = re.sub(r'http[s]?://[a-zA-Z0-9./_?=%&#+!-]+', '', text)
    text = re.sub(r'pic\.twitter\.com/[a-zA-Z0-9./_?=%&#+!]+', '', text)

    # Remove @mentions and #hashtags.
    text = re.sub(r'@[\w]+', '', text)
    text = re.sub(r'#([\w]+)', '', text)

    # Remove HTML entities left over from scraping.
    text = re.sub(r'&amp;|&gt;', '', text)

    # Remove punctuation and digits.
    text = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', '', text)
    text = re.sub(r'[0-9]+', '', text)

    # Collapse repeated spaces, trim, and lowercase.
    text = re.sub(' +', ' ', text)
    text = text.strip()
    text = text.lower()

    # Collapse characters repeated three or more times ("halooo" -> "halo").
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    return text
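
# Example (illustrative input):
# clean_text("Halooo!!! cek https://t.co/abc 123") -> "halo cek"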


# Load the colloquial-Indonesian lexicon (slang -> standard form), then merge
# in the project's custom slang dictionary.
kamus_path = '_json_colloquial-indonesian-lexicon.txt'
with open(kamus_path) as f:
    lookp_dict = json.load(f)

kamus_sendiri_path = 'kamus_gaul_custom.txt'
with open(kamus_sendiri_path) as f:
    kamus_gaul_baru = json.load(f)

lookp_dict.update(kamus_gaul_baru)


def normalize_slang(text, slang_dict):
    """Replace each slang word with its standard form, if known."""
    words = text.split()
    normalized_words = [slang_dict.get(word, word) for word in words]
    return ' '.join(normalized_words)
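
# Example (illustrative, assuming the lexicon maps 'gak' -> 'tidak' and
# 'tau' -> 'tahu'):
# normalize_slang("gak tau", lookp_dict) -> "tidak tahu"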


# Indonesian stopword list from NLTK (downloaded on first run).
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("indonesian"))


def remove_stopwords(text, stop_words):
    """Drop common Indonesian function words from the text."""
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)


def preprocessing_data(hidden_data):
    """Run each headline through cleaning, slang normalization, and stopword
    removal, keeping every intermediate stage for inspection."""
    results_prep = []
    df = pd.DataFrame(hidden_data)
    texts = df["judul-berita"]

    for text in texts:
        cleaned_text = clean_text(text)
        norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
        tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

        results_prep.append({
            'judul-berita': text,
            'cleaned-text': cleaned_text,
            'normalisasi-text': norm_slang_text,
            'stopwords-remove': tanpa_stopwords,
        })
    return results_prep
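
# Each preprocessing record keeps all stages, e.g. (illustrative values):
# {'judul-berita': 'Harga Emas Naik', 'cleaned-text': 'harga emas naik',
#  'normalisasi-text': 'harga emas naik', 'stopwords-remove': 'harga emas naik'}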


def eksplorasi_data(selected_options, results, colormap, words):
    """Render the EDA panel: a word cloud and a most-common-words chart."""
    if 'Hasil EDA' in selected_options:
        columns = st.columns(2)
        all_texts = ""
        with columns[0]:
            if results:
                texts = [result.get('stopwords-remove') for result in results
                         if pd.notna(result.get('stopwords-remove'))]
                all_texts = " ".join(texts)

            st.subheader("Word Cloud")
            if all_texts:
                wordcloud = WordCloud(width=800, height=500,
                                      background_color='white',
                                      colormap=colormap,
                                      contour_color='black',
                                      contour_width=2,
                                      mask=None).generate(all_texts)
                st.image(wordcloud.to_array())

        with columns[1]:
            st.subheader("Most Common Words")
            if all_texts:
                word_counts = Counter(all_texts.split())
                most_common_words = word_counts.most_common(words)
                # Use fresh names; reusing `words` would shadow the parameter.
                top_words, top_counts = zip(*most_common_words)

                fig, ax = plt.subplots(figsize=(10, 6))
                ax.bar(top_words, top_counts)
                ax.set_xlabel("Kata-kata")
                ax.set_ylabel("Jumlah")
                ax.set_title("Kata-kata Paling Umum")
                ax.tick_params(axis='x', rotation=45)
                st.pyplot(fig)


@st.cache_data
def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words, param):
    """Scrape with the given function, preprocess, and render the EDA panel.

    The leading underscore tells st.cache_data not to hash the function
    argument.
    """
    hidden_data = _scrape_function(query, date, jumlah, param)
    scraping_done = True
    results = preprocessing_data(hidden_data)

    eksplorasi_data(selected_options, results, colormap, words)
    return hidden_data, scraping_done, results


st.title("Aplikasi Web Scraping & Eksplorasi Data")

with st.expander("Scraping Settings :"):
    selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
    if selected_site == "Tempo.co":
        waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
        selected_channel = st.selectbox("Pilih Kanal :", ['Defaults', 'All(Latest Only)', 'Nasional', 'Metro', 'Dunia', 'Bisnis', 'Bola', 'Sport', 'Gaya', 'Seleb', 'Cantik', 'Tekno', 'Otomotif', 'Travel', 'Blog', 'Difabel', 'Ramadan', 'Kolom', 'Fokus', 'Creative Lab', 'Event', 'Data', 'Cek Fakta', 'Newsletter', 'Inforial'])
    query = st.text_input("Masukkan Query :").replace(' ', '+')

    jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")
    date = date.today()  # shadows the imported date class; only strftime is needed afterwards
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
    param_kosong = []

with st.expander("Preference Settings :"):
    selected_options = st.multiselect(
        'Pilih tampilan:',
        ['Hasil Scraping', 'Hasil Preprocessing', 'Hasil EDA'],
        ['Hasil Scraping', 'Hasil EDA']
    )
    if "Hasil EDA" in selected_options:
        colormap = st.selectbox("Pilih Warna Wordclouds :", ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds", "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu", "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"])
        words = st.number_input("Masukkan Jumlah Most Common Words :", min_value=1, max_value=15, step=1, value=10, placeholder="Type a number...")
    else:
        colormap = "Greys"
        words = 10

st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")


hidden_data = []
results = []
scraping_done = False

if st.button("Mulai Scraping"):
    if not query:
        st.error("Mohon isi query.")
    else:
        if selected_site == "CNBC Indonesia":
            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), jumlah, selected_options, colormap, words, param_kosong)

        elif selected_site == "Detik.com":
            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_detik_news, query, date, jumlah, selected_options, colormap, words, param_kosong)

        elif selected_site == "Viva.co.id":
            st.warning("Masih dalam pengembangan, silakan gunakan situs yang lain.")
            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_viva_data, query, date, jumlah, selected_options, colormap, words, param_kosong)

        elif selected_site == "Tempo.co":
            st.warning("Masih dalam pengembangan, silakan gunakan situs yang lain.")
            hidden_data, scraping_done, results = scrape_and_explore_data(scrape_tempo_data, query, waktu, jumlah, selected_options, colormap, words, selected_channel)

        elif selected_site == "Liputan6.com":
            st.error("Belum bisa dipakai.")


if scraping_done:
    if hidden_data:
        df = pd.DataFrame(hidden_data)
        df_prep = pd.DataFrame(results)

        if 'Hasil Scraping' in selected_options:
            with st.expander(f"Hasil Scraping {selected_site} :"):
                st.write(df)
        if 'Hasil Preprocessing' in selected_options:
            with st.expander("Hasil Preprocessing Data :"):
                st.write(df_prep)

        if download_format == "XLSX":
            df.to_excel(f"hasil_scraping_{query}.xlsx", index=False)
            df_prep.to_excel(f"hasil_preprocess_{query}.xlsx", index=False)
            st.download_button(label=f"Unduh Hasil Scraping XLSX ({len(hidden_data)} data)", data=open(f"hasil_scraping_{query}.xlsx", "rb").read(), key="xlsx_download", file_name=f"hasil_scraping_{query}.xlsx")
            st.download_button(label=f"Unduh Hasil Preprocess XLSX ({len(results)} data)", data=open(f"hasil_preprocess_{query}.xlsx", "rb").read(), key="xlsx_download_2", file_name=f"hasil_preprocess_{query}.xlsx")
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            csv_prep = df_prep.to_csv(index=False)
            st.download_button(label=f"Unduh Hasil Scraping CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=f"hasil_scraping_{query}.csv")
            st.download_button(label=f"Unduh Hasil Preprocess CSV ({len(results)} data)", data=csv_prep, key="csv_download_2", file_name=f"hasil_preprocess_{query}.csv")
        elif download_format == "JSON":
            json_data = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]).to_json(orient="records")
            # Select columns by the keys preprocessing_data actually produces;
            # unknown column names would yield empty JSON fields.
            json_data_prep = pd.DataFrame(results, columns=["judul-berita", "cleaned-text", "normalisasi-text", "stopwords-remove"]).to_json(orient="records")
            st.download_button(label=f"Unduh Hasil Scraping JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=f"hasil_scraping_{query}.json")
            st.download_button(label=f"Unduh Hasil Preprocess JSON ({len(results)} data)", data=json_data_prep, key="json_download_2", file_name=f"hasil_preprocess_{query}.json")
        elif download_format == "TXT":
            text_data = "\n".join([f"{row['date']} - {row['judul-berita']} - {row['link-berita']}" for row in hidden_data])
            st.download_button(label=f"Unduh Hasil Scraping TXT ({len(hidden_data)} data)", data=text_data, key="txt_download", file_name=f"hasil_scraping_{query}.txt")

    if not hidden_data:
        st.warning(f"Tidak ada data pada query '{query}'", icon="⚠️")
if not scraping_done:
    st.write("Tidak ada data untuk diunduh.")


st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')