Spaces:

Kayyyy27
/

sentiment_analysis

Sleeping

App Files Files Community

sentiment_analysis / app.py

Kayyyy27

Update app.py

e0e20ce verified about 1 year ago

raw

history blame contribute delete

8.54 kB

	import os
	import copy

	import streamlit as st
	#Basic libraries
	import pandas as pd
	import numpy as np

	#NLTK libraries
	import nltk
	import re
	import string
	from wordcloud import WordCloud,STOPWORDS
	from langdetect import detect
	import unicodedata

	#Visualization libraries
	from textblob import TextBlob
	from plotly import tools
	import plotly.graph_objs as go
	from plotly.subplots import make_subplots

	#Ignore warnings
	import warnings
	warnings.filterwarnings('ignore')

	#Other miscellaneous libraries
	from scipy import interp
	from itertools import cycle
	import cufflinks as cf
	from collections import defaultdict
	from collections import Counter

	# # Use a pipeline as a high-level helper
	from transformers import pipeline
	sentiment_pipeline = pipeline("text-classification", model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis")

	def clean_text(text):
	# Make text lowercase
	text = str(text).lower()

	# Remove text in square brackets
	text = re.sub('\[.*?\]', '', text)

	# Remove links
	text = re.sub('https?://\S+\|www\.\S+', '', text)

	# Remove HTML tags
	text = re.sub('<.*?>+', '', text)

	# Remove punctuation
	text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

	# Remove newlines
	text = re.sub('\n', '', text)

	# Remove words containing numbers
	text = re.sub('\w\d\w', '', text)

	return text

	def remove_emojis(text):
	# Split the sentence into individual characters
	characters = [char for char in text]

	# Iterate over each character and remove emojis
	cleaned_text = ''
	for char in characters:
	if not any(char in range(0x1F600, 0x1F650) or char in range(0x1F300, 0x1F6FF) or char in range(0x2600, 0x26FF) for char in map(ord, char)):
	cleaned_text += char

	return cleaned_text

	def filter_english_reviews(df):
	# Function to detect language
	def detect_language(text):
	try:
	language = detect(text)
	return language == 'en' # Return True if language is English
	except:
	return False # Return False if language detection fails

	# Filter non-English comments
	df['is_english'] = df['text'].apply(detect_language)
	df = df[df['is_english']]

	return df

	#custom function for ngram generation
	def generate_ngrams(text, n_gram=2):
	token = [token for token in text.lower().split(" ") if token != "" if token not in STOPWORDS]
	ngrams = zip(*[token[i:] for i in range(n_gram)])
	return [" ".join(ngram) for ngram in ngrams]

	#custom function for horizontal bar chart
	def horizontal_bar_chart(df, color):
	trace = go.Bar(
	y=df["word"].values[::-1],
	x=df["wordcount"].values[::-1],
	showlegend=False,
	orientation = 'h',
	marker=dict(
	color=color,
	),
	)
	return trace

	def plot_bigram(text,file_name):
	# Generate the bar chart from reviews
	freq_dict = defaultdict(int)
	sentiment_count = defaultdict(int)
	for sentence in text:
	result = sentiment_pipeline(sentence)
	sentiment = result[0]["label"]
	for word in generate_ngrams(sentence,2):
	freq_dict[word] += 1
	if sentiment == "LABEL_2":
	sentiment_count[word] += 1

	#count the number of sentences containing the word
	postwithword_count = defaultdict(int)
	for word in freq_dict.keys():
	for line in text:
	# if any(w in line for w in word.split()):
	if word in line:
	postwithword_count[word] += 1



	fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
	fd_sorted.columns = ["word", "wordcount"]

	# Add a new column "positive_sentiment_percentage" to the DataFrame
	fd_sorted["positive_sentiment_percentage"] = fd_sorted["word"].map(sentiment_count)/fd_sorted["wordcount"]


	# Check if the DataFrame is empty
	if fd_sorted.empty:
	st.title("No bigrams found.")
	return # Return early if the DataFrame is empty

	# trace0 = horizontal_bar_chart(fd_sorted.head(25), 'orange')
	# # Create a subplot
	# fig = make_subplots(rows=1, cols=1, subplot_titles=[file_name.split('_', 1)[0]])
	# fig.add_trace(trace0, 1, 1)

	# Create a horizontal bar chart trace for word count
	trace_word_count = go.Bar(
	x=fd_sorted["wordcount"].head(25),
	y=fd_sorted["word"].head(25),
	name="Word Count",
	orientation="h"
	)

	# Create a horizontal bar chart trace for positive sentiment percentage
	trace_positive_sentiment = go.Bar(
	x=fd_sorted["positive_sentiment_percentage"].head(25),
	y=fd_sorted["word"].head(25),
	name="Positive Sentiment Percentage",
	orientation="h"
	)

	# Create a subplot
	fig = make_subplots(rows=1, cols=2, subplot_titles=["Word Count", "Positive Sentiment Percentage"])
	fig.add_trace(trace_word_count, 1, 1)
	fig.add_trace(trace_positive_sentiment, 1, 2)

	# Remove y-axis of the second plot
	fig.update_yaxes(showticklabels=False, row=1, col=2)

	fig.update_layout(height=900, width=1000, title="High Frequency Words")
	st.plotly_chart(fig, use_container_width=True)





	def process_reviews(raw_reviews, year, month, filename):

	# Set the index to 'Post_creation_date'
	raw_reviews.set_index('Post_creation_date', inplace=True)

	# Convert the index to a DatetimeIndex object
	raw_reviews.index = pd.to_datetime(raw_reviews.index)

	# Sort the DataFrame by index
	raw_reviews = raw_reviews.sort_index()


	# Clean the text
	raw_reviews['text'] = raw_reviews['text'].apply(clean_text)
	raw_reviews['text'] = raw_reviews['text'].apply(remove_emojis)
	stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each',
	'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
	'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above',
	'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't",
	'very', 'should', 'any', 'y', 'isn', 'who', 'a', 'they', 'to', 'too', "should've", 'has', 'before',
	'into', 'yours', "it's", 'do', 'against', 'on', 'now', 'her', 've', 'd', 'by', 'am', 'from',
	'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
	'his', 'himself', 'ourselves', 'was', 'through', 'out', 'below', 'own', 'myself', 'theirs',
	'me', 'why', 'once', 'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
	'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
	'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all','link','join','bio','us','we',
	'myself','we','you','de','la','link','bio']
	raw_reviews['text'] = raw_reviews['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

	# Filter English reviews
	processed_reviews = filter_english_reviews(raw_reviews)

	# Filter the reviews based on the specified year and quarter
	filtered_reviews = raw_reviews[(raw_reviews.index.year == year) & (raw_reviews.index.month == month)]

	if filtered_reviews.empty:
	st.title("No available data")
	return # Return early if the DataFrame is empty

	filtered_reviews['text'] = filtered_reviews['text'].str[:500]
	text_list = filtered_reviews['text'].tolist()
	text_list_truncated = [text[:500] for text in text_list] # Truncate the input sequences to 500 tokens


	#plot bigram
	plot_bigram(text_list_truncated,filename)

	#set header
	st.header(":rainbow[Instagram Hashtag Analytics] ", divider='rainbow')
	#import file
	file = st.sidebar.file_uploader("Import File")
	year = st.sidebar.selectbox('Year', options=['2023', '2024'], index=1)
	month = st.sidebar.selectbox('Month', options=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], index=0)

	if file:

	#get data
	@st.cache_data
	def load_data(path):
	df = pd.read_csv(path)
	# df.columns = df.columns.str.lower()
	return df

	df = load_data(file)
	process_reviews(df, int(year), int(month),file.name)


	else:

	st.write("Please upload your hashtag file")