# Helpers for sentiment-analysis visualisation: word clouds, UMAP/HDBSCAN
# cluster scatter plots, and BERTopic topic modelling.
import pandas as pd
import numpy as np
from PIL import Image
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string
import re #regex library
#umap
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# import word_tokenize from NLTK
from transformers import AutoTokenizer
from script.plotting import visualize_barchart
def load_stopwords():
    """Load the Indonesian stopword list and extend it with punctuation.

    Returns:
        list[str]: words from the first column of assets/stopwordbahasa.csv,
        followed by every character in ``string.punctuation``.
    """
    words = pd.read_csv("assets/stopwordbahasa.csv", header=None)[0].tolist()
    return words + list(string.punctuation)
def tokenisasi(df):
    """Tokenize the ``content`` column of *df* with the 'indobert' tokenizer.

    Filters out WordPiece continuation pieces (``##...``), stopwords, and
    tokens of length <= 4.

    Args:
        df: DataFrame with a string ``content`` column.

    Returns:
        pd.Series of token lists, aligned with ``df``'s index.
    """
    stop = load_stopwords()
    bert_tokenizer = AutoTokenizer.from_pretrained('indobert')

    def _keep(tok):
        # drop subword continuations, stopwords, and short tokens
        return not tok.startswith('##') and tok not in stop and len(tok) > 4

    pieces = df.content.apply(bert_tokenizer.tokenize)
    return pieces.apply(lambda toks: [t for t in toks if _keep(t)])
def get_wordcloud(df, kelas_sentiment):
    """Build a twitter-mask word cloud for one sentiment class.

    Args:
        df: DataFrame with ``content`` and ``sentiment`` columns.
        kelas_sentiment: 'positif', 'negatif', or 'netral' — selects both
            the rows and the colormap.

    Returns:
        A fitted ``WordCloud``; when the class yields no usable tokens,
        a placeholder cloud generated from the literal string "None".
    """
    mask = np.array(Image.open('./assets/twitter.png'))
    cmap_dict = {'positif': 'YlGn', 'negatif': 'OrRd', 'netral': 'GnBu'}
    tokens = tokenisasi(df[df.sentiment == kelas_sentiment])
    text = ' '.join(tokens.apply(' '.join))
    # Build the cloud once; the two original copies differed only in the text.
    wc = WordCloud(width=800, height=800,
                   background_color='black',
                   min_font_size=10,
                   colormap=cmap_dict[kelas_sentiment],
                   mask=mask)
    try:
        return wc.generate(text)
    except ValueError:
        # WordCloud.generate raises ValueError when the text contains no
        # words; fall back to a placeholder instead of crashing. (Was a bare
        # except, which also hid KeyErrors, typos, KeyboardInterrupt, etc.)
        return wc.generate("None")
def plot_text(df, kelas, embedding_model):
    """Embed, UMAP-project, HDBSCAN-cluster, and scatter-plot one sentiment class.

    Args:
        df: DataFrame with ``content`` and ``sentiment`` columns.
        kelas: sentiment class used to filter rows.
        embedding_model: object with ``.encode(list) -> array`` (e.g. a
            SentenceTransformer).

    Returns:
        (contents, embeddings, fig): the filtered ``content`` Series, the raw
        embeddings, and the Plotly scatter figure.
    """
    df = df[df.sentiment == kelas]
    # NOTE(review): this encodes whole rows (df.values), not just the
    # `content` column — confirm that is intended.
    data = embedding_model.encode(df.values.tolist())
    # n_neighbors is capped so UMAP works on very small classes
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0], 5), random_state=42)
    umap_data = umap_model.fit_transform(data)
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=round((df.shape[0]) ** 0.5 - 1), min_samples=3)
    clusterer.fit(umap_data)
    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    # HDBSCAN marks noise as -1; present it as "outlier"
    labels = ["outlier" if lbl == "cluster -1" else lbl for lbl in labels]
    # wrap hover text at 50 chars using Plotly's <br> line breaks
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))
    # BUGFIX: the original built a first px.scatter figure and immediately
    # discarded it; only this call is kept.
    fig = px.scatter(x=umap_data[:, 0], y=umap_data[:, 1],
                     color=labels, text=text)
    # hide the per-point text (kept for hover only) and enlarge markers
    fig.update_traces(textfont_color='rgba(0,0,0,0)', marker_size=8)
    # transparent plot background
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    fig.update_layout(margin=dict(l=40, r=5, t=0, b=40))
    # grey axis lines, no grid/zero lines
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes(zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_layout(font_family="sans-serif")
    fig.update_layout(showlegend=False)
    return df["content"], data, fig
def topic_modelling(df, embed_df):
    """Fit a BERTopic model on pre-embedded documents and chart topic keywords.

    Args:
        df: Series of document strings; words of length <= 3 are dropped
            before fitting.
        embed_df: precomputed document embeddings passed to ``fit_transform``.

    Returns:
        (fig, topic_model): the keyword bar chart and the fitted model.
    """
    docs = df.apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3))
    stop = load_stopwords()
    topic_model = BERTopic(
        calculate_probabilities=True,
        # cluster model
        hdbscan_model=hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words=stop),
        language="indonesian",
    )
    topic_model.fit_transform(docs, embed_df)
    # relabel topics as comma-separated keyword lists without the numeric prefix
    labels = topic_model.generate_topic_labels(topic_prefix=False, separator=", ")
    topic_model.set_topic_labels(labels)
    fig = visualize_barchart(topic_model)
    return fig, topic_model