# Page residue from the hosting site ("Spaces: Runtime error"); not part of the program.
import gradio as gr # Untuk UI | |
from transformers import pipeline | |
import pandas as pd | |
from torch.utils.data import Dataset, DataLoader | |
import torch | |
import gc | |
import re | |
from tqdm import tqdm | |
import matplotlib.pyplot as plt | |
import snscrape.modules.twitter as sntwitter | |
import datetime as dt | |
import sys | |
import os | |
def _safe_filename_part(text):
    """Return *text* with characters that are unsafe in file names replaced by ``_``."""
    return re.sub(r'[^\w\-. ]+', '_', str(text))


def _collect_tweet_rows(query, limit, extract_row):
    """Scrape at most *limit* tweets for *query* and map each through *extract_row*.

    ``limit=None`` means "no limit". A ``KeyboardInterrupt`` stops the scrape
    early and whatever was collected so far is returned.
    """
    from itertools import islice  # local import: only this helper needs it

    rows = []
    tweets = sntwitter.TwitterSearchScraper(query).get_items()
    try:
        # islice stops before requesting a tweet beyond the limit.
        for tweet in tqdm(islice(tweets, limit)):
            rows.append(extract_row(tweet))
    except KeyboardInterrupt:
        print("Scraping berhenti atas permintaan pengguna")
    return rows


def scrape_tweets(query, max_tweets=-1, output_path="./scraper/output/"):
    """Scrape tweets matching *query*, save them to a CSV file and return them.

    Args:
        query: Search query passed to ``sntwitter.TwitterSearchScraper``.
        max_tweets: Maximum number of tweets to scrape. ``-1`` (or ``None``)
            means no limit.
        output_path: Directory in which the timestamped CSV file is written.
            It is created when missing.

    Returns:
        A ``pandas.DataFrame`` of the scraped tweets. On Python >= 3.8 it holds
        the full set of columns and only rows whose ``Language`` is ``"in"``;
        on older interpreters it holds ``Datetime``, ``Tweet Id`` and ``Text``
        without any language filtering.
    """
    os.makedirs(output_path, exist_ok=True)
    timestamp = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # The query may contain characters such as "/" or ":" that are not valid
    # in a file name, so it is sanitised before being embedded in the path.
    output_path = os.path.join(
        output_path, timestamp + "-" + _safe_filename_part(query) + ".csv"
    )

    if max_tweets is None or int(max_tweets) == -1:
        limit = None
    else:
        limit = max(0, int(max_tweets))

    if sys.version_info >= (3, 8):
        def full_row(tweet):
            return [tweet.date, tweet.id, tweet.content, tweet.user.username,
                    tweet.likeCount, tweet.retweetCount, tweet.replyCount,
                    tweet.quoteCount, tweet.url, tweet.lang]

        rows = _collect_tweet_rows(query, limit, full_row)
        df = pd.DataFrame(rows, columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Likes', 'Retweets', 'Replies', 'Quotes', 'URL', 'Language'])
        is_indonesian = df["Language"] == "in"
        print("Tweet berbahasa Indonesia :", int(is_indonesian.sum()), "/", len(rows))
        # Copy so that callers can add or overwrite columns without touching
        # a slice of the unfiltered frame.
        df = df[is_indonesian].copy()
    else:
        # Because Google Colab runs Python 3.7, the scraping library available
        # there is an older, incomplete version, so the Indonesian-language
        # filter cannot be applied.
        print("Using older version of Python")

        def basic_row(tweet):
            return [tweet.date, tweet.id, tweet.content]

        rows = _collect_tweet_rows(query, limit, basic_row)
        df = pd.DataFrame(rows, columns=['Datetime', 'Tweet Id', 'Text'])

    df.to_csv(output_path, index=False)
    print("Data tweet tersimpan di", output_path)
    return df
def remove_unnecessary_char(text):
    """Replace dataset placeholder tokens with spaces and collapse space runs.

    The literal tokens ``[USERNAME]``, ``[URL]`` and ``[SENSITIVE-NO]`` are each
    replaced by a single space, then every run of spaces becomes one space.
    """
    # Raw strings keep the backslash escapes intact for the regex engine and
    # avoid Python's invalid-escape-sequence warning.
    text = re.sub(r"\[USERNAME\]", " ", text)
    text = re.sub(r"\[URL\]", " ", text)
    text = re.sub(r"\[SENSITIVE-NO\]", " ", text)
    text = re.sub(r" +", " ", text)
    return text
def preprocess_tweet(text):
    """Strip tweet-specific noise from *text*.

    Newlines, ``@username`` mentions, URLs (``www.…``, ``http://…``,
    ``https://…``) and ``/`` characters are each replaced by a space, after
    which runs of spaces are collapsed to one. Leading or trailing spaces are
    not trimmed.
    """
    text = re.sub(r'\n', ' ', text)  # Newlines become spaces
    # One pass removes every mention, including a leading run of them: the
    # spaces left behind are merged by the final space-collapsing step.
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)  # Remove every URL
    text = re.sub(r'/', ' ', text)
    text = re.sub(r' +', ' ', text)  # Remove extra spaces
    return text
def remove_nonaplhanumeric(text):
    """Return *text* with each run of non-ASCII-alphanumeric characters turned into one space."""
    return re.sub(pattern='[^0-9a-zA-Z]+', repl=' ', string=text)
def preprocess_text(text):
    """Run the full cleaning chain on *text* and return it lower-cased.

    Order: tweet noise removal, placeholder-token removal, non-alphanumeric
    removal, then lower-casing.
    """
    cleaned = remove_unnecessary_char(preprocess_tweet(text))
    return remove_nonaplhanumeric(cleaned).lower()
# Text-classification pipeline used by both UI modes. It is placed on CUDA
# device 0 when torch reports CUDA as available, otherwise on CPU (-1).
predict = pipeline(
    task='text-classification',
    model='karuniaperjuangan/smsa-distilbert-indo',
    device=-1 if not torch.cuda.is_available() else 0,
)
def analyze_df_sentiment(df, batch_size):
    """Classify every row of ``df["Text"]`` and store the results on *df*.

    Args:
        df: DataFrame with a ``Text`` column. It is modified in place: the
            columns ``Label`` and ``Score`` are added (or overwritten).
        batch_size: Number of texts passed to ``predict`` per call. Values
            below 1 are treated as 1.

    Returns:
        The same DataFrame object, with ``Label`` and ``Score`` filled in.
    """
    # A zero step would make range() raise, and a negative one would yield no
    # batches at all, so the batch size is clamped to at least 1.
    batch_size = max(1, int(batch_size))
    texts = df["Text"].astype(str).tolist()

    predictions = []
    # Split the texts into consecutive slices of batch_size elements.
    for start in tqdm(range(0, len(texts), batch_size)):
        predictions.extend(predict(texts[start:start + batch_size]))

    df["Label"] = [pred["label"] for pred in predictions]
    df["Score"] = [pred["score"] for pred in predictions]
    return df
def keyword_analyzer(keyword, max_tweets, batch_size=16):
    """Scrape tweets for *keyword*, classify their sentiment and chart the result.

    Args:
        keyword: Search query forwarded to ``scrape_tweets``.
        max_tweets: Maximum number of tweets to scrape, forwarded unchanged.
        batch_size: Batch size forwarded to ``analyze_df_sentiment``.

    Returns:
        A ``(figure, dataframe)`` tuple: a matplotlib figure with a pie chart
        of the label distribution, and a DataFrame holding the ``Text``,
        ``Label`` and ``Score`` columns.
    """
    print("Scraping tweets...")
    df = scrape_tweets(keyword, max_tweets=max_tweets)
    df["Text"] = df["Text"].apply(preprocess_text)
    print("Analyzing sentiment...")
    df = analyze_df_sentiment(df, batch_size=batch_size)

    # Draw on an explicit axes instead of relying on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(6, 6))
    label_counts = df.groupby(["Label"])["Text"].count()
    if label_counts.empty:
        # A pie chart cannot be drawn from no data; show a message instead of
        # failing when the scrape returned nothing.
        ax.text(0.5, 0.5, "Tidak ada tweet yang ditemukan", ha="center", va="center")
        ax.set_axis_off()
    else:
        label_counts.plot.pie(ax=ax, autopct="%.1f%%")
    return fig, df[["Text", "Label", "Score"]]
def _classify_single_tweet(Tweet):
    """Return ``(label, score)`` for one tweet, running the model only once.

    The parameter keeps the name ``Tweet`` used by the original callback.
    """
    result = predict(Tweet)[0]
    return result['label'], result['score']


# Two-tab UI: keyword-based scraping + analysis, and single-tweet analysis.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 style="text-align:center">Aplikasi Sentiment Analysis Keyword Twitter </h1>""")
    gr.Markdown(
        """
Aplikasi ini digunakan untuk melakukan sentimen analisis terhadap data di Twitter menggunakan model DistilBERT. Terdapat 2 mode yang dapat digunakan:
1. Trend/Keyword: Untuk melakukan analisis terhadap semua tweet yang mengandung keyword yang diinputkan
2. Tweet: Untuk melakukan analisis terhadap sebuah tweet yang diinputkan
"""
    )
    with gr.Tab("Trend/Keyword"):
        gr.Markdown("""Masukkan keyword dan jumlah maksimum tweet yang ingin diambil""")
        with gr.Blocks():
            with gr.Row():
                with gr.Column():
                    keyword_textbox = gr.Textbox(lines=1, label="Keyword")
                    max_tweets_component = gr.Number(value=-1, label="Tweet Maksimal yang akan discrape (-1 jika ingin mengscrape semua tweet)", precision=0)
                    batch_size_component = gr.Number(value=16, label="Batch Size (Semakin banyak semakin cepat, tetapi semakin boros memori)", precision=0)
                    button = gr.Button("Submit")
                plot_component = gr.Plot(label="Pie Chart")
            # NOTE(review): `max_rows` and `overflow_row_behaviour` may not be
            # accepted by every Gradio release - confirm against the pinned
            # version.
            dataframe_component = gr.DataFrame(type="pandas",
                                               label="Dataframe",
                                               max_rows=(20, 'fixed'),
                                               overflow_row_behaviour='paginate',
                                               wrap=True)
    with gr.Tab("Single Tweet"):
        gr.Interface(_classify_single_tweet,
                     "textbox",
                     ["label", "label"],
                     allow_flagging='never',
                     )
    gr.Markdown(
        """
Space ini merupakan tugas NLP dari mata kuliah Pemrosesan Bahasa Alami yang diampu oleh Bapak Syukron Abu Ishaq Alfarozi.
## Anggota Kelompok
- Karunia Perjuangan Mustadl'afin - 20/456368/TK/50498
- Pramudya Kusuma Hardika - 20/460558/TK/51147
"""
    )
    # Wire the submit button to the keyword pipeline; outputs feed the pie
    # chart and the result table.
    button.click(keyword_analyzer,
                 inputs=[keyword_textbox, max_tweets_component, batch_size_component],
                 outputs=[plot_component, dataframe_component])
demo.launch(inbrowser=True)