tugas / script /text_proc.py
wudapp's picture
Duplicate from dafqi/indo_twitter_sentiment_app
a2720a3
raw
history blame contribute delete
No virus
4.2 kB
import pandas as pd
import numpy as np
from PIL import Image
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string
import re #regex library
#umap
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# import word_tokenize from NLTK
from transformers import AutoTokenizer
from script.plotting import visualize_barchart
def load_stopwords():
    """Return the Indonesian stopword list from the bundled CSV, plus ASCII punctuation.

    The CSV is expected to have a single, header-less column of stopwords.
    """
    words = pd.read_csv("assets/stopwordbahasa.csv", header=None)[0].tolist()
    words.extend(string.punctuation)
    return words
def tokenisasi(df):
    """Tokenize the ``content`` column with an IndoBERT tokenizer.

    Keeps only tokens that are not WordPiece continuations (``##`` prefix),
    are not stopwords, and are longer than 4 characters.
    Returns a Series of token lists, aligned with ``df``'s index.
    """
    stopword_set = load_stopwords()
    tokenizer = AutoTokenizer.from_pretrained('indobert')

    def _keep(tok):
        # drop subword pieces, stopwords, and short tokens
        return not tok.startswith('##') and tok not in stopword_set and len(tok) > 4

    return df.content.apply(tokenizer.tokenize).apply(
        lambda toks: [t for t in toks if _keep(t)]
    )
def get_wordcloud(df, kelas_sentiment):
    """Build a Twitter-bird-shaped word cloud for one sentiment class.

    Parameters
    ----------
    df : DataFrame with ``sentiment`` and ``content`` columns.
    kelas_sentiment : one of 'positif', 'negatif', 'netral' (selects rows
        and the colormap).

    Returns a fitted ``WordCloud`` instance.
    """
    mask = np.array(Image.open('./assets/twitter.png'))
    cmap_dict = {'positif': 'YlGn', 'negatif': 'OrRd', 'netral': 'GnBu'}
    tokens = tokenisasi(df[df.sentiment == kelas_sentiment])
    text = ' '.join(tokens.apply(lambda x: ' '.join(x)))
    # Shared constructor settings so the happy path and the fallback agree.
    wc_kwargs = dict(width=800, height=800,
                     background_color='black',
                     min_font_size=10,
                     colormap=cmap_dict[kelas_sentiment],
                     mask=mask)
    # WordCloud.generate raises ValueError when the text yields no words;
    # the original used a bare `except:` which hid every other error too.
    try:
        wordcloud = WordCloud(**wc_kwargs).generate(text)
    except ValueError:
        wordcloud = WordCloud(**wc_kwargs).generate("None")
    return wordcloud
def plot_text(df, kelas, embedding_model):
    """Embed, reduce (UMAP), cluster (HDBSCAN) and scatter-plot texts of one sentiment class.

    Parameters
    ----------
    df : DataFrame with ``sentiment`` and ``content`` columns.
    kelas : sentiment label used to filter rows.
    embedding_model : object with an ``encode(list) -> array`` method
        (e.g. a SentenceTransformer) — assumed; confirm against caller.

    Returns ``(contents, embeddings, fig)``: the filtered content Series,
    the raw embeddings, and the Plotly scatter figure.
    """
    df = df[df.sentiment == kelas]
    data = embedding_model.encode(df.values.tolist())
    # n_neighbors is capped so UMAP works on very small classes.
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0], 5), random_state=42)
    umap_data = umap_model.fit_transform(data)
    # Heuristic cluster size ~ sqrt(n); HDBSCAN labels noise points as -1.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=round((df.shape[0])**(0.5) - 1), min_samples=3)
    clusterer.fit(umap_data)
    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    labels = ["outlier" if i == "cluster -1" else i for i in labels]
    # Hover text: wrap long tweets so the tooltip stays readable.
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))
    # FIX: the original built a first px.scatter here and immediately
    # discarded it by rebuilding the figure — dead work, removed.
    fig = px.scatter(x=umap_data[:, 0], y=umap_data[:, 1], color=labels, text=text)
    # Hide the point labels (kept only for hover) and size the markers.
    fig.update_traces(textfont_color='rgba(0,0,0,0)', marker_size=8)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    fig.update_layout(margin=dict(l=40, r=5, t=0, b=40))
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes(zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_layout(font_family="sans-serif")
    fig.update_layout(showlegend=False)
    return df["content"], data, fig
def topic_modelling(df, embed_df):
    """Fit BERTopic on pre-embedded texts and return a keyword bar chart.

    Parameters
    ----------
    df : Series of raw text documents (each ``.split()``-able) — assumed
        from the ``.apply``/``x.split()`` usage; confirm against caller.
    embed_df : precomputed document embeddings passed to ``fit_transform``.

    Returns ``(fig, topic_model)``: the bar-chart figure and the fitted model.
    """
    # Keep only words longer than 3 characters in each document.
    data = df.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    stopwords = load_stopwords()
    topic_model = BERTopic(
        calculate_probabilities=True,
        # denser clusters than the default; prediction_data needed for probabilities
        hdbscan_model=hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words=stopwords),
        language="indonesian",
    )
    # FIX: the original unpacked (topics, probs) but never used them.
    topic_model.fit_transform(data, embed_df)
    topic_labels = topic_model.generate_topic_labels(
        topic_prefix=False,
        separator=", ",
    )
    topic_model.set_topic_labels(topic_labels)
    fig = visualize_barchart(topic_model)
    return fig, topic_model