tugas / script /text_proc.py
wudapp's picture
Duplicate from dafqi/indo_twitter_sentiment_app
a2720a3
raw
history blame contribute delete
No virus
4.2 kB
import pandas as pd
import numpy as np
from PIL import Image
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string
import re #regex library
#umap
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# import word_tokenize from NLTK
from transformers import AutoTokenizer
from script.plotting import visualize_barchart
def load_stopwords():
    """Return the Indonesian stopword list from the bundled CSV, plus ASCII punctuation.

    The CSV is expected to have a single, header-less column of stopwords.
    """
    words = pd.read_csv("assets/stopwordbahasa.csv", header=None)[0].tolist()
    words.extend(string.punctuation)
    return words
def tokenisasi(df):
    """Tokenize the ``content`` column with an IndoBERT tokenizer.

    Keeps only tokens that are not WordPiece continuations (``##`` prefix),
    are not stopwords, and are longer than 4 characters.
    Returns a Series of token lists, aligned with ``df``'s index.
    """
    stopword_set = load_stopwords()
    tokenizer = AutoTokenizer.from_pretrained('indobert')

    def _keep(tok):
        # drop subword pieces, stopwords, and short tokens
        return not tok.startswith('##') and tok not in stopword_set and len(tok) > 4

    return df.content.apply(tokenizer.tokenize).apply(
        lambda toks: [t for t in toks if _keep(t)]
    )
def get_wordcloud(df, kelas_sentiment):
    """Build a Twitter-bird-shaped word cloud for one sentiment class.

    Parameters
    ----------
    df : DataFrame with ``sentiment`` and ``content`` columns.
    kelas_sentiment : one of 'positif', 'negatif', 'netral' (selects rows
        and the colormap).

    Returns a fitted ``WordCloud`` instance.
    """
    mask = np.array(Image.open('./assets/twitter.png'))
    cmap_dict = {'positif': 'YlGn', 'negatif': 'OrRd', 'netral': 'GnBu'}
    tokens = tokenisasi(df[df.sentiment == kelas_sentiment])
    text = ' '.join(tokens.apply(lambda x: ' '.join(x)))
    # Shared constructor settings so the happy path and the fallback agree.
    wc_kwargs = dict(width=800, height=800,
                     background_color='black',
                     min_font_size=10,
                     colormap=cmap_dict[kelas_sentiment],
                     mask=mask)
    # WordCloud.generate raises ValueError when the text yields no words;
    # the original used a bare `except:` which hid every other error too.
    try:
        wordcloud = WordCloud(**wc_kwargs).generate(text)
    except ValueError:
        wordcloud = WordCloud(**wc_kwargs).generate("None")
    return wordcloud
def plot_text(df, kelas, embedding_model):
    """Embed, reduce (UMAP), cluster (HDBSCAN) and scatter-plot texts of one sentiment class.

    Parameters
    ----------
    df : DataFrame with ``sentiment`` and ``content`` columns.
    kelas : sentiment label used to filter rows.
    embedding_model : object with an ``encode(list) -> array`` method
        (e.g. a SentenceTransformer) — assumed; confirm against caller.

    Returns ``(contents, embeddings, fig)``: the filtered content Series,
    the raw embeddings, and the Plotly scatter figure.
    """
    df = df[df.sentiment == kelas]
    data = embedding_model.encode(df.values.tolist())
    # n_neighbors is capped so UMAP works on very small classes.
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0], 5), random_state=42)
    umap_data = umap_model.fit_transform(data)
    # Heuristic cluster size ~ sqrt(n); HDBSCAN labels noise points as -1.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=round((df.shape[0])**(0.5) - 1), min_samples=3)
    clusterer.fit(umap_data)
    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    labels = ["outlier" if i == "cluster -1" else i for i in labels]
    # Hover text: wrap long tweets so the tooltip stays readable.
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))
    # FIX: the original built a first px.scatter here and immediately
    # discarded it by rebuilding the figure — dead work, removed.
    fig = px.scatter(x=umap_data[:, 0], y=umap_data[:, 1], color=labels, text=text)
    # Hide the point labels (kept only for hover) and size the markers.
    fig.update_traces(textfont_color='rgba(0,0,0,0)', marker_size=8)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    fig.update_layout(margin=dict(l=40, r=5, t=0, b=40))
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes(zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_layout(font_family="sans-serif")
    fig.update_layout(showlegend=False)
    return df["content"], data, fig
def topic_modelling(df, embed_df):
    """Fit BERTopic on pre-embedded texts and return a keyword bar chart.

    Parameters
    ----------
    df : Series of raw text documents (each ``.split()``-able) — assumed
        from the ``.apply``/``x.split()`` usage; confirm against caller.
    embed_df : precomputed document embeddings passed to ``fit_transform``.

    Returns ``(fig, topic_model)``: the bar-chart figure and the fitted model.
    """
    # Keep only words longer than 3 characters in each document.
    data = df.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    stopwords = load_stopwords()
    topic_model = BERTopic(
        calculate_probabilities=True,
        # denser clusters than the default; prediction_data needed for probabilities
        hdbscan_model=hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words=stopwords),
        language="indonesian",
    )
    # FIX: the original unpacked (topics, probs) but never used them.
    topic_model.fit_transform(data, embed_df)
    topic_labels = topic_model.generate_topic_labels(
        topic_prefix=False,
        separator=", ",
    )
    topic_model.set_topic_labels(topic_labels)
    fig = visualize_barchart(topic_model)
    return fig, topic_model