# NOTE(review): the three lines here ("Spaces: / Sleeping / Sleeping") were
# page-scrape residue from the hosting platform, not part of the program.
import os | |
import copy | |
import streamlit as st | |
#Basic libraries | |
import pandas as pd | |
import numpy as np | |
#NLTK libraries | |
import nltk | |
import re | |
import string | |
from wordcloud import WordCloud,STOPWORDS | |
from langdetect import detect | |
import unicodedata | |
#Visualization libraries | |
from textblob import TextBlob | |
from plotly import tools | |
import plotly.graph_objs as go | |
from plotly.subplots import make_subplots | |
#Ignore warnings | |
import warnings | |
warnings.filterwarnings('ignore') | |
#Other miscellaneous libraries | |
from scipy import interp | |
from itertools import cycle | |
import cufflinks as cf | |
from collections import defaultdict | |
from collections import Counter | |
# Use a pipeline as a high-level helper.
from transformers import pipeline
# Loaded once at module import (slow on first run: downloads the model).
# Fine-tuned United Airlines Twitter sentiment classifier; plot_bigram treats
# "LABEL_2" as positive — presumably LABEL_0/1 are negative/neutral, TODO confirm.
sentiment_pipeline = pipeline("text-classification", model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis")
def clean_text(text):
    """Lowercase *text* and strip noise for downstream NLP.

    Removes, in order: text in square brackets, URLs, HTML tags,
    punctuation, newlines, and any word containing a digit.

    Parameters
    ----------
    text : any
        Coerced to ``str`` before cleaning.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = str(text).lower()
    # Raw strings (r'...') fix the invalid escape sequences the original
    # patterns produced (\[, \S, \w, \d in plain string literals warn on
    # modern Python).
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove newlines
    text = re.sub(r'\n', '', text)
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    return text
def remove_emojis(text):
    """Return *text* with characters in common emoji code-point ranges removed.

    Drops code points in U+1F300..U+1F6FE (symbols, pictographs, transport)
    and U+2600..U+26FE (miscellaneous symbols). Upper bounds are exclusive,
    matching the original filter exactly.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        *text* with the covered code points filtered out.
    """
    kept = []
    for ch in text:
        cp = ord(ch)
        # The original also tested 0x1F600-0x1F650, which is a subset of
        # 0x1F300-0x1F6FF, so two range checks suffice.
        if 0x1F300 <= cp < 0x1F6FF or 0x2600 <= cp < 0x26FF:
            continue
        kept.append(ch)
    # ''.join avoids the quadratic cost of repeated `cleaned_text += char`.
    return ''.join(kept)
def filter_english_reviews(df):
    """Keep only rows of *df* whose 'text' column is detected as English.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings.

    Returns
    -------
    pandas.DataFrame
        The English-only rows, with the original columns (the temporary
        helper column is dropped before returning).
    """
    def detect_language(text):
        # langdetect raises on empty/undetectable input; treat that as
        # "not English" instead of aborting the whole batch. Kept broad
        # (Exception) but no longer a bare `except:` that would swallow
        # KeyboardInterrupt/SystemExit too.
        try:
            return detect(text) == 'en'
        except Exception:
            return False
    # Work on a copy so the caller's frame is not mutated and the slice
    # assignment below does not trigger SettingWithCopyWarning.
    df = df.copy()
    df['is_english'] = df['text'].apply(detect_language)
    # Drop the helper column so callers get the original schema back.
    df = df[df['is_english']].drop(columns=['is_english'])
    return df
#custom function for ngram generation | |
def generate_ngrams(text, n_gram=2): | |
token = [token for token in text.lower().split(" ") if token != "" if token not in STOPWORDS] | |
ngrams = zip(*[token[i:] for i in range(n_gram)]) | |
return [" ".join(ngram) for ngram in ngrams] | |
#custom function for horizontal bar chart | |
def horizontal_bar_chart(df, color): | |
trace = go.Bar( | |
y=df["word"].values[::-1], | |
x=df["wordcount"].values[::-1], | |
showlegend=False, | |
orientation = 'h', | |
marker=dict( | |
color=color, | |
), | |
) | |
return trace | |
def plot_bigram(text, file_name):
    """Render side-by-side bar charts of the 25 most frequent bigrams.

    Each sentence is classified once by the sentiment pipeline; every
    bigram in it accumulates a total count plus a count of occurrences
    inside positive ("LABEL_2") sentences. The left chart shows raw
    frequency, the right the positive share per bigram.

    Parameters
    ----------
    text : iterable of str
        Cleaned review sentences.
    file_name : str
        Unused by the plot itself; kept for interface compatibility.
    """
    freq_dict = defaultdict(int)
    sentiment_count = defaultdict(int)
    for sentence in text:
        result = sentiment_pipeline(sentence)
        sentiment = result[0]["label"]
        for word in generate_ngrams(sentence, 2):
            freq_dict[word] += 1
            if sentiment == "LABEL_2":
                sentiment_count[word] += 1
    # Guard BEFORE touching DataFrame columns: assigning two column names
    # to the empty, column-less frame built from an empty dict raises
    # ValueError, so the original late emptiness check was unreachable.
    if not freq_dict:
        st.title("No bigrams found.")
        return
    # (The original also built a `postwithword_count` dict here in an
    # O(bigrams x sentences) loop, but never used it — removed.)
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda kv: kv[1], reverse=True))
    fd_sorted.columns = ["word", "wordcount"]
    # Share of each bigram's occurrences coming from positive sentences;
    # bigrams never seen in a positive sentence map to 0 via the defaultdict.
    fd_sorted["positive_sentiment_percentage"] = (
        fd_sorted["word"].map(sentiment_count) / fd_sorted["wordcount"]
    )
    # Horizontal bar trace for raw bigram frequency.
    trace_word_count = go.Bar(
        x=fd_sorted["wordcount"].head(25),
        y=fd_sorted["word"].head(25),
        name="Word Count",
        orientation="h",
    )
    # Horizontal bar trace for the positive-sentiment share.
    trace_positive_sentiment = go.Bar(
        x=fd_sorted["positive_sentiment_percentage"].head(25),
        y=fd_sorted["word"].head(25),
        name="Positive Sentiment Percentage",
        orientation="h",
    )
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=["Word Count", "Positive Sentiment Percentage"])
    fig.add_trace(trace_word_count, 1, 1)
    fig.add_trace(trace_positive_sentiment, 1, 2)
    # Both panels share the same words; hide the duplicate y labels.
    fig.update_yaxes(showticklabels=False, row=1, col=2)
    fig.update_layout(height=900, width=1000, title="High Frequency Words")
    st.plotly_chart(fig, use_container_width=True)
def process_reviews(raw_reviews, year, month, filename):
    """Clean, language-filter, and time-filter reviews, then plot bigrams.

    Parameters
    ----------
    raw_reviews : pandas.DataFrame
        Must contain 'Post_creation_date' and 'text' columns.
    year, month : int
        Calendar period to keep after cleaning.
    filename : str
        Forwarded to plot_bigram.
    """
    # Index by post date so we can slice by year/month below.
    raw_reviews.set_index('Post_creation_date', inplace=True)
    raw_reviews.index = pd.to_datetime(raw_reviews.index)
    raw_reviews = raw_reviews.sort_index()
    # Clean the text
    raw_reviews['text'] = raw_reviews['text'].apply(clean_text)
    raw_reviews['text'] = raw_reviews['text'].apply(remove_emojis)
    # Set (not list) for O(1) membership; duplicates in the original list
    # ('we', 'link', 'bio', ...) collapse harmlessly.
    stop_words = {
        'yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each',
        'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
        'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above',
        'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't",
        'very', 'should', 'any', 'y', 'isn', 'who', 'a', 'they', 'to', 'too', "should've", 'has', 'before',
        'into', 'yours', "it's", 'do', 'against', 'on', 'now', 'her', 've', 'd', 'by', 'am', 'from',
        'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
        'his', 'himself', 'ourselves', 'was', 'through', 'out', 'below', 'own', 'myself', 'theirs',
        'me', 'why', 'once', 'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
        'at', 'after', 'its', 'which', 'there', 'our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
        'over', 'again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all', 'link', 'join',
        'bio', 'us', 'de', 'la',
    }
    raw_reviews['text'] = raw_reviews['text'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words))
    # Filter English reviews.
    processed_reviews = filter_english_reviews(raw_reviews)
    # BUG FIX: the original computed processed_reviews and then filtered the
    # UNfiltered raw_reviews, so the language filter had no effect.
    filtered_reviews = processed_reviews[
        (processed_reviews.index.year == year) & (processed_reviews.index.month == month)
    ]
    if filtered_reviews.empty:
        st.title("No available data")
        return  # Return early if the DataFrame is empty
    # Copy before assigning into the slice to avoid SettingWithCopyWarning.
    filtered_reviews = filtered_reviews.copy()
    filtered_reviews['text'] = filtered_reviews['text'].str[:500]
    text_list = filtered_reviews['text'].tolist()
    # Truncate to 500 characters to stay within the pipeline's input limit.
    text_list_truncated = [t[:500] for t in text_list]
    # Plot bigram charts.
    plot_bigram(text_list_truncated, filename)
#set header | |
st.header(":rainbow[Instagram Hashtag Analytics] ", divider='rainbow') | |
#import file | |
file = st.sidebar.file_uploader("Import File") | |
year = st.sidebar.selectbox('Year', options=['2023', '2024'], index=1) | |
month = st.sidebar.selectbox('Month', options=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], index=0) | |
if file: | |
#get data | |
def load_data(path): | |
df = pd.read_csv(path) | |
# df.columns = df.columns.str.lower() | |
return df | |
df = load_data(file) | |
process_reviews(df, int(year), int(month),file.name) | |
else: | |
st.write("Please upload your hashtag file") | |