# Kayyyy27's picture
# Update app.py
# e0e20ce verified
import os
import copy
import streamlit as st
#Basic libraries
import pandas as pd
import numpy as np
#NLTK libraries
import nltk
import re
import string
from wordcloud import WordCloud,STOPWORDS
from langdetect import detect
import unicodedata
#Visualization libraries
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.subplots import make_subplots
#Ignore warnings
import warnings
warnings.filterwarnings('ignore')
#Other miscellaneous libraries
from scipy import interp
from itertools import cycle
import cufflinks as cf
from collections import defaultdict
from collections import Counter
# # Use a pipeline as a high-level helper
from transformers import pipeline
@st.cache_resource
def _load_sentiment_pipeline():
    """Build the text-classification pipeline used for sentiment scoring.

    Streamlit re-executes this script on every widget interaction, so the
    loader is wrapped in ``st.cache_resource`` to keep a single pipeline
    object alive instead of re-instantiating the model on each rerun.
    """
    return pipeline(
        "text-classification",
        model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis",
    )


# Module-level handle used by plot_bigram; same name and call signature as before.
sentiment_pipeline = _load_sentiment_pipeline()
def clean_text(text):
    """Normalise a raw post into lowercase text without markup or noise.

    The steps run in a fixed order because later steps rely on earlier ones
    (for example, links must be removed before punctuation is stripped,
    otherwise the URL pattern no longer matches).

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string. Removed fragments are deleted in place, so runs
        of several spaces can remain where a fragment used to be.
    """
    # Make text lowercase
    text = str(text).lower()
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so the words on either side of a line
    # break are not fused into one token.
    text = re.sub(r'\n', ' ', text)
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    return text
def remove_emojis(text):
    """Return ``text`` with characters from three emoji code-point ranges removed.

    A character is dropped when its code point lies in one of the ranges
    below. The upper bound of each ``range`` is exclusive.
    """
    emoji_ranges = (
        range(0x1F600, 0x1F650),
        range(0x1F300, 0x1F6FF),
        range(0x2600, 0x26FF),
    )
    kept = []
    for symbol in text:
        code_point = ord(symbol)
        if any(code_point in block for block in emoji_ranges):
            continue
        kept.append(symbol)
    return ''.join(kept)
def filter_english_reviews(df):
    """Return the rows of ``df`` whose ``text`` column is detected as English.

    Side effect: a boolean ``is_english`` column is added to ``df`` in place;
    the returned frame is the subset where that column is True.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The filtered DataFrame (rows where ``is_english`` is True).
    """
    def detect_language(text):
        """Return True when ``detect`` reports 'en', False on any failure."""
        try:
            return detect(text) == 'en'
        except Exception:
            # Detection failure is treated as "not English" (best effort).
            # Catching Exception rather than using a bare except keeps
            # KeyboardInterrupt/SystemExit from being swallowed.
            return False

    # Filter non-English comments
    df['is_english'] = df['text'].apply(detect_language)
    return df[df['is_english']]
#custom function for ngram generation
def generate_ngrams(text, n_gram=2):
    """Return the space-joined n-grams of ``text`` after stop-word removal.

    The text is lowercased and split on single spaces; empty pieces and
    entries of ``STOPWORDS`` are discarded before the n-grams are formed.
    """
    words = []
    for piece in text.lower().split(" "):
        if piece == "" or piece in STOPWORDS:
            continue
        words.append(piece)
    # Zipping n progressively shifted copies yields consecutive n-word windows.
    shifted = [words[offset:] for offset in range(n_gram)]
    return [" ".join(window) for window in zip(*shifted)]
#custom function for horizontal bar chart
def horizontal_bar_chart(df, color):
    """Build a horizontal Plotly bar trace from ``df``.

    Reads the ``word`` and ``wordcount`` columns, reverses both, and returns
    a legend-less ``go.Bar`` drawn in ``color``.
    """
    labels = df["word"].values[::-1]
    counts = df["wordcount"].values[::-1]
    return go.Bar(
        x=counts,
        y=labels,
        orientation='h',
        showlegend=False,
        marker={"color": color},
    )
def plot_bigram(text, file_name):
    """Render the 25 most frequent bigrams and their positive-sentiment share.

    Every sentence is classified with ``sentiment_pipeline``. Each bigram
    that ``generate_ngrams`` yields for the sentence is counted, and counted
    again as positive when the sentence's label is ``"LABEL_2"``.
    NOTE(review): ``LABEL_2`` is assumed to be the model's positive class -
    confirm against the model card.

    Args:
        text: Iterable of sentence strings.
        file_name: Name of the source file. Currently unused; kept so
            existing callers keep working.

    Returns:
        None. Output is drawn with Streamlit: either a two-panel bar chart or
        a "No bigrams found." title when there is nothing to plot.
    """
    freq_dict = defaultdict(int)
    sentiment_count = defaultdict(int)
    for sentence in text:
        bigrams = generate_ngrams(sentence, 2)
        if not bigrams:
            # Nothing to attribute a label to, so skip the model call.
            continue
        sentiment = sentiment_pipeline(sentence)[0]["label"]
        for bigram in bigrams:
            freq_dict[bigram] += 1
            if sentiment == "LABEL_2":
                sentiment_count[bigram] += 1

    # Bail out before building the frame: assigning two column names to an
    # empty DataFrame raises ValueError, which made the old emptiness check
    # unreachable.
    if not freq_dict:
        st.title("No bigrams found.")
        return

    fd_sorted = pd.DataFrame(
        sorted(freq_dict.items(), key=lambda x: x[1])[::-1],
        columns=["word", "wordcount"],
    )
    # sentiment_count is a defaultdict, so bigrams never seen in a positive
    # sentence map to 0 rather than NaN. The value is a fraction in [0, 1].
    fd_sorted["positive_sentiment_percentage"] = (
        fd_sorted["word"].map(sentiment_count) / fd_sorted["wordcount"]
    )

    top = fd_sorted.head(25)
    # Horizontal bar chart trace for word count
    trace_word_count = go.Bar(
        x=top["wordcount"],
        y=top["word"],
        name="Word Count",
        orientation="h"
    )
    # Horizontal bar chart trace for positive sentiment percentage
    trace_positive_sentiment = go.Bar(
        x=top["positive_sentiment_percentage"],
        y=top["word"],
        name="Positive Sentiment Percentage",
        orientation="h"
    )
    # Side-by-side panels sharing the same bigram order
    fig = make_subplots(rows=1, cols=2, subplot_titles=["Word Count", "Positive Sentiment Percentage"])
    fig.add_trace(trace_word_count, 1, 1)
    fig.add_trace(trace_positive_sentiment, 1, 2)
    # Hide the tick labels of the second plot; they repeat the first plot's.
    fig.update_yaxes(showticklabels=False, row=1, col=2)
    fig.update_layout(height=900, width=1000, title="High Frequency Words")
    st.plotly_chart(fig, use_container_width=True)
# Words removed from every post before bigram extraction. Lookup is done on
# text that clean_text has already stripped of punctuation, so the entries
# containing apostrophes can only match if that cleaning step changes.
_REVIEW_STOP_WORDS = frozenset([
    'yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each',
    'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
    'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above',
    'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't",
    'very', 'should', 'any', 'y', 'isn', 'who', 'a', 'they', 'to', 'too', "should've", 'has', 'before',
    'into', 'yours', "it's", 'do', 'against', 'on', 'now', 'her', 've', 'd', 'by', 'am', 'from',
    'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
    'his', 'himself', 'ourselves', 'was', 'through', 'out', 'below', 'own', 'myself', 'theirs',
    'me', 'why', 'once', 'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
    'at', 'after', 'its', 'which', 'there', 'our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
    'over', 'again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all',
    'link', 'join', 'bio', 'us', 'de', 'la',
])


def process_reviews(raw_reviews, year, month, filename):
    """Clean one month of posts and plot their most frequent bigrams.

    Args:
        raw_reviews: DataFrame with a ``Post_creation_date`` column (parsed
            with ``pd.to_datetime``) and a ``text`` column. It is not
            modified.
        year: Calendar year to keep, as an int.
        month: Calendar month (1-12) to keep, as an int.
        filename: Name of the uploaded file, forwarded to ``plot_bigram``.

    Returns:
        None. Shows "No available data" when no English post falls in the
        requested month; otherwise delegates rendering to ``plot_bigram``.
    """
    # Index by creation date without touching the caller's frame.
    reviews = raw_reviews.set_index('Post_creation_date')
    reviews.index = pd.to_datetime(reviews.index)
    reviews = reviews.sort_index()

    # Restrict to the requested month first so the per-row cleaning and
    # language detection below only run on rows that can be plotted.
    in_period = (reviews.index.year == year) & (reviews.index.month == month)
    reviews = reviews[in_period].copy()
    if reviews.empty:
        st.title("No available data")
        return

    # Clean the text
    reviews['text'] = reviews['text'].apply(clean_text).apply(remove_emojis)
    reviews['text'] = reviews['text'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in _REVIEW_STOP_WORDS)
    )

    # Keep English posts only. The filtered frame is the one used from here
    # on; previously its result was discarded and every language was plotted.
    english_reviews = filter_english_reviews(reviews)
    if english_reviews.empty:
        st.title("No available data")
        return

    # Cap each post at 500 characters before it is passed to the model.
    text_list = english_reviews['text'].str[:500].tolist()
    # Plot bigrams
    plot_bigram(text_list, filename)
# Page header
st.header(":rainbow[Instagram Hashtag Analytics] ", divider='rainbow')

# Sidebar inputs: the data file plus the year/month to analyse
file = st.sidebar.file_uploader("Import File")
year = st.sidebar.selectbox('Year', options=['2023', '2024'], index=1)
month_choices = [str(number) for number in range(1, 13)]
month = st.sidebar.selectbox('Month', options=month_choices, index=0)

if not file:
    st.write("Please upload your hashtag file")
else:
    @st.cache_data
    def load_data(path):
        """Read the uploaded CSV into a DataFrame; the result is cached by Streamlit."""
        return pd.read_csv(path)

    df = load_data(file)
    process_reviews(df, int(year), int(month), file.name)