|
import pandas as pd |
|
import numpy as np |
|
import re |
|
import snscrape.modules.twitter as sntwitter |
|
from transformers import pipeline |
|
import plotly.express as px |
|
import joblib |
|
from sklearn.metrics import classification_report, confusion_matrix |
|
|
|
|
|
import nltk |
|
|
|
# Fetch the NLTK resources this module relies on (tokenizer models and
# stop-word lists), in the same order as before.
for _nltk_resource in ("punkt", "stopwords"):
    nltk.download(_nltk_resource)
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
import tweepy |
|
|
|
|
|
import os

# Twitter API credentials.
#
# SECURITY NOTE(review): these secrets were committed in source. Each value
# can now be supplied through an environment variable; the literals remain
# only as backward-compatible fallbacks and should be rotated and removed.
CONSUMER_KEY = os.environ.get(
    "TWITTER_CONSUMER_KEY", "B1Bf8lgml7EJTjm9M7D9NQjRQ"
)
CONSUMER_SECRET = os.environ.get(
    "TWITTER_CONSUMER_SECRET",
    "f320J9Os5AfDZrlFsAmsPpBRxvHtByXeZq3gOiSzAYNtpdyR5m",
)
ACCESS_TOKEN = os.environ.get(
    "TWITTER_ACCESS_TOKEN",
    "1490990154647216130-zR8SAKAWGqH7VSa0dneVue3XSDOWof",
)
ACCESS_TOKEN_SECRET = os.environ.get(
    "TWITTER_ACCESS_TOKEN_SECRET",
    "E3xaLMwxm7H4sSDfxzdZl8TJ9LcLe7wEUWG8gvUGyUkfX",
)
|
|
|
|
|
def _clean_tweet_text(content):
    """Return a copy of the text Series ``content`` with Twitter noise removed.

    Steps, in order: drop mentions, hashtags, links and ``pic.twitter.com``
    URLs; drop the standalone tokens ``RT`` and ``amp``; drop every character
    outside the allowed set; strip the ends; collapse whitespace runs.
    """
    # ``regex=True`` is passed explicitly: since pandas 2.0 the default for
    # ``Series.str.replace`` is literal matching, which would silently turn
    # every step below into a no-op.
    removal_patterns = (
        r"@\S+",  # mentions
        r"#\S+",  # hashtags
        r"http\S+",  # links
        r"pic\.twitter\.com\S+",  # image links (dots escaped)
        # Word boundaries keep "RT"/"amp" inside ordinary words intact
        # (e.g. "sampai"); "&amp;" still loses its "amp", and the leftover
        # "&;" is removed by the character filter below.
        r"\bRT\b",
        r"\bamp\b",
    )
    for pattern in removal_patterns:
        content = content.str.replace(pattern, "", regex=True)

    content = content.str.replace(
        r"[^\w\s#@/:%.,_-]", "", flags=re.UNICODE, regex=True
    )
    content = content.str.strip()
    return content.str.replace(r"\s+", " ", regex=True)


def get_tweets(username, length=10, option=None):
    """Search Twitter and return the cleaned tweet texts as a DataFrame.

    Args:
        username: Search term. Unless ``option`` is ``"Advanced"`` the
            suffix ``" -filter:links filter:replies lang:id"`` is appended.
        length: Maximum number of tweets requested from the cursor.
        option: Pass ``"Advanced"`` to send ``username`` as the raw query.

    Returns:
        A DataFrame with a single ``"content"`` column holding the cleaned
        texts; rows that are empty after cleaning are dropped.
    """
    if option == "Advanced":
        query = username
    else:
        query = username + " -filter:links filter:replies lang:id"

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)

    cursor = tweepy.Cursor(api.search_tweets, q=query)
    rows = [[tweet.text] for tweet in cursor.items(length)]

    tweets_df = pd.DataFrame(rows, columns=["content"])
    tweets_df["content"] = _clean_tweet_text(tweets_df["content"])

    # Return an independent copy so callers can add columns to the result
    # without pandas emitting SettingWithCopyWarning.
    return tweets_df[tweets_df["content"] != ""].copy()
|
|
|
|
|
def get_sentiment(df, option_model):
    """Classify every row of ``df["content"]`` and add a ``"sentiment"`` column.

    Args:
        df: DataFrame with a ``"content"`` text column. It is modified in
            place: a ``"sentiment"`` column is added (or overwritten).
        option_model: ``"IndoBERT (Accurate,Slow)"`` selects the transformers
            pipeline loaded from ``"indobert"``;
            ``"Logistic Regression (Less Accurate,Fast)"`` selects the
            ``"Logistic Regression"`` entry of ``assets/df_model.pkl``; any
            other value is looked up in that file by ``model_name`` as-is.

    Returns:
        A view of ``df`` with ``"sentiment"`` as the first column.

    Raises:
        KeyError: If a predicted label cannot be mapped to a sentiment name.
    """
    id2label = {0: "negatif", 1: "netral", 2: "positif"}
    label_names = set(id2label.values())

    def to_label(raw):
        # Integer class ids are looked up directly. String labels are also
        # accepted, either an existing sentiment name or an id with an
        # optional prefix ("LABEL_2", "2").
        if isinstance(raw, str):
            if raw in label_names:
                return raw
            suffix = raw.rsplit("_", 1)[-1]
            if not suffix.isdigit():
                raise KeyError(raw)
            raw = int(suffix)
        return id2label[raw]

    texts = df["content"].tolist()

    if option_model == "IndoBERT (Accurate,Slow)":
        classifier = pipeline("sentiment-analysis", model="indobert")
        raw_labels = [classifier(text)[0]["label"] for text in texts]
    else:
        if option_model == "Logistic Regression (Less Accurate,Fast)":
            model_name = "Logistic Regression"
        else:
            model_name = option_model
        # NOTE(review): joblib.load unpickles arbitrary objects; the asset
        # file must come from a trusted source.
        df_model = joblib.load("assets/df_model.pkl")
        classifier = df_model[df_model.model_name == model_name].model.values[0]
        # One batched call instead of one predict() per row; skipped when
        # there is nothing to classify.
        raw_labels = classifier.predict(texts) if texts else []

    df["sentiment"] = [to_label(raw) for raw in raw_labels]

    # Put "sentiment" first by name, so the result is right even when the
    # column already existed somewhere other than the last position.
    ordered = ["sentiment"] + [col for col in df.columns if col != "sentiment"]
    return df[ordered]
|
|
|
|
|
def get_bar_chart(df):
    """Return a Plotly bar chart of the number of rows per sentiment.

    ``df`` is grouped by its ``"sentiment"`` column and the per-group count
    of the ``"content"`` column is used as both bar height and bar label.
    """
    counts = df.groupby(["sentiment"]).count().reset_index()

    palette = {
        "positif": "#00cc96",
        "negatif": "#ef553b",
        "netral": "#636efa",
    }
    fig = px.bar(
        counts,
        x="sentiment",
        y="content",
        color="sentiment",
        text="content",
        color_discrete_map=palette,
    )

    # Count labels sit above the bars.
    fig.update_traces(textposition="outside")

    # No legend, extra bottom margin, and labels below the minimum size
    # are hidden.
    fig.update_layout(
        showlegend=False,
        margin=dict(t=0, b=150, l=0, r=0),
        uniformtext_minsize=8,
        uniformtext_mode="hide",
    )

    fig.update_yaxes(title_text="Jumlah Komentar")
    return fig
|
|
|
|
|
def plot_model_summary(df_model):
    """Return a scatter plot of score against time for the ``"test"`` rows.

    Only rows of ``df_model`` whose ``set_data`` equals ``"test"`` are
    plotted; each point is coloured by ``model_name``.
    """
    is_test = df_model.set_data == "test"
    test_rows = df_model[is_test][["score", "time", "model_name"]]

    fig = px.scatter(
        test_rows,
        x="time",
        y="score",
        color="model_name",
        hover_data=["model_name"],
    )

    fig.update_xaxes(title_text="time (s)")
    fig.update_yaxes(title_text="accuracy")

    marker_style = dict(size=10)
    fig.update_traces(marker=marker_style)

    layout_margin = dict(t=0, l=0, r=0)
    fig.update_layout(autosize=False, margin=layout_margin, height=400)
    return fig
|
|
|
|
|
def plot_clfr(df_model, option_model, df):
    """Return a heatmap of the classification report for one model.

    Args:
        df_model: Unused; kept so existing callers keep working.
        option_model: Model name; predictions are read from the column
            ``f"{option_model}_pred"`` of ``df``.
        df: DataFrame holding the true labels in ``"label"`` and the
            predictions in the column named above.

    Returns:
        A Plotly figure with one row per class/average and one column per
        metric; the last metric column of the report is left out.
    """
    report = classification_report(
        df["label"], df[f"{option_model}_pred"], output_dict=True
    )
    df_clfr = pd.DataFrame(report)

    # classification_report lists classes in sorted label order, so the
    # names must follow negatif, netral, positif -- the same order used by
    # id2label in get_sentiment and by plot_confusion_matrix. The previous
    # positif/netral/negatif order swapped the two outer classes.
    # Assumes exactly three classes are present -- TODO confirm.
    df_clfr.columns = [
        "negatif",
        "netral",
        "positif",
        "accuracy",
        "macro_avg",
        "weighted_avg",
    ]

    # Rows become classes/averages; the trailing column (presumably
    # "support") is dropped so only score-like metrics are coloured.
    metrics = df_clfr.T.iloc[:, :-1]

    fig = px.imshow(metrics, x=metrics.columns, y=metrics.index)

    fig.update_layout(coloraxis_showscale=False)
    fig.update_layout(coloraxis_colorscale="gnbu")

    # Print each cell value with two decimals on top of the heatmap.
    fig.update_traces(
        text=metrics.values, texttemplate="%{text:.2f}", textfont_size=12
    )

    fig.update_layout(title_text="π Classification Report")
    return fig
|
|
|
|
|
def plot_confusion_matrix(df_model, option_model, df):
    """Return an annotated confusion-matrix heatmap for one model.

    True labels come from ``df["label"]`` and predictions from the column
    ``f"{option_model}_pred"``. ``df_model`` is not used by this function.
    """
    predicted = df[f"{option_model}_pred"]
    matrix = confusion_matrix(df["label"], predicted)

    axis_x = ["negatif", "netral", "positif"]
    axis_y = ["negatif", "netral", "positif"]
    fig = px.imshow(matrix, x=axis_x, y=axis_y)

    fig.update_layout(coloraxis_showscale=False)
    fig.update_layout(
        coloraxis_colorscale="gnbu",
        title_text="π Confusion Matrix",
    )

    # Write the raw counts into the cells.
    fig.update_traces(
        text=matrix,
        texttemplate="%{text:.0f}",
        textfont_size=15,
    )
    return fig
|
|