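# Streamlit app: United Airlines Twitter sentiment analysis with spoken output.
# Tweets (typed in or uploaded as a CSV) are classified as positive/neutral/negative
# with a fine-tuned text-classification model, and each result is read aloud with SpeechT5.
# Run locally with: streamlit run app.py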
import streamlit as st
import pandas as pd
from IPython.display import Audio
import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from num2words import num2words
import plotly.express as px
# Dataset loading/preprocessing helper
def preprocess_dataset(file_path, num_samples_each_class):
    dataset = pd.read_excel(file_path)
    # Keep only confidently labelled rows and the two columns we need
    dataset = dataset[dataset['airline_sentiment_confidence'] > 0.5]
    dataset = dataset[['text', 'airline_sentiment']]
    # Randomly select an equal number of samples for each sentiment class
    dataset = dataset.groupby('airline_sentiment').apply(lambda x: x.sample(n=num_samples_each_class)).reset_index(drop=True)
    text = dataset['text']
    sentiments = dataset['airline_sentiment']
    return text, sentiments
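# Illustrative usage (the file name and sample size below are hypothetical):
#   texts, labels = preprocess_dataset("Tweets.xlsx", num_samples_each_class=500)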
def txt2sentiment(text):
    # Classify a single tweet with the fine-tuned sentiment model
    pipe = pipeline("text-classification", model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis")
    result = pipe(text)
    predicted_label = result[0]['label']
    # Map the raw model labels to human-readable sentiments
    label_mapping = {"LABEL_2": "positive", "LABEL_1": "neutral", "LABEL_0": "negative"}
    predicted_label = label_mapping[predicted_label]
    predicted_confidence = "{:.2f}".format(result[0]['score'] * 100)
    return text, predicted_label, predicted_confidence
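# Illustrative example of the mapping above (the score shown is made up):
#   pipe("Great flight, thank you @united!") -> [{'label': 'LABEL_2', 'score': 0.97}]
#   label_mapping['LABEL_2'] -> 'positive', reported as 97.00 percent confidence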
def sentiment2audio(userinput, sentiment, confidence):
    # Speak the tweet and its predicted sentiment with SpeechT5
    pipe3 = pipeline("text-to-speech", "microsoft/speecht5_tts")
    # SpeechT5 needs a speaker embedding; use a fixed voice from the CMU Arctic x-vectors
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    text = f"{userinput}. The tweet is {sentiment}, with {confidence} percent confidence"
    speech = pipe3(text, forward_params={"speaker_embeddings": speaker_embedding})
    audio = Audio(speech['audio'], rate=speech['sampling_rate'])
    return audio
def audiosummary(sentiment_counts):
    # Spell out the counts so the text-to-speech model reads them naturally
    positive_sentiment_count = num2words(sentiment_counts.get('positive', 0), lang='en')
    negative_sentiment_count = num2words(sentiment_counts.get('negative', 0), lang='en')
    neutral_sentiment_count = num2words(sentiment_counts.get('neutral', 0), lang='en')
    text = f"There are {positive_sentiment_count} positive tweets, {negative_sentiment_count} negative tweets, and {neutral_sentiment_count} neutral tweets"
    # Generate the spoken summary with SpeechT5 and the same fixed speaker embedding
    pipe3 = pipeline("text-to-speech", "microsoft/speecht5_tts")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = pipe3(text, forward_params={"speaker_embeddings": speaker_embedding})
    audio = Audio(speech['audio'], rate=speech['sampling_rate'])
    return audio
def process_tweet(tweet):
    # Stage 1: Text to sentiment
    text, predicted_sentiment, predicted_confidence = txt2sentiment(tweet)
    # Stage 2: Sentiment to audio
    audio_output = sentiment2audio(text, predicted_sentiment, num2words(predicted_confidence, lang='en'))
    # Create a dictionary with the tweet information
    tweet_info = {
        'Input': text,
        'Sentiment': predicted_sentiment,
        'Confidence': f'{predicted_confidence}%',
        'Audio': audio_output
    }
    # Return the tweet information
    return tweet_info
def main():
    st.header(":rainbow[United Airlines Twitter Sentiment Analysis]", divider='rainbow')
    st.write(":blue[Enter tweets to analyze their sentiment (separated by @united) or upload a file:]")
    # User input options
    option = st.selectbox("Input type", ["Text Input", "File Upload"])
    tweets = []
    if option == "Text Input":
        user_input = st.text_input("Enter tweets (separated by @united)")
        if user_input:
            # Split the user input into multiple tweets
            user_tweets = [tweet.strip() for tweet in user_input.split("@united")[1:]]
            for tweet in user_tweets:
                tweet_info = process_tweet(tweet)
                tweets.append(tweet_info)
    elif option == "File Upload":
        uploaded_file = st.file_uploader("Upload csv file", type="csv")
        if uploaded_file is not None:
            # Read the uploaded CSV file into a pandas DataFrame
            df = pd.read_csv(uploaded_file)
            df["text"] = df["text"].str.replace("@united", "")
            # Process each tweet in the DataFrame
            for tweet in df["text"]:
                tweet_info = process_tweet(tweet)
                tweets.append(tweet_info)
st.write(":red[Only the first 5 audio outputs will be displayed]")
# Create a DataFrame from the list of tweet information
df_tweets = pd.DataFrame(tweets)
# Display the DataFrame as a table
if df_tweets is not None and not df_tweets.empty:
# Create a copy of the DataFrame to add audio buttons
df_with_buttons = df_tweets.head(5).copy()
# Iterate over the DataFrame rows
for index, row in df_with_buttons.iterrows():
audio = row['Audio']
st.audio(audio.data, format="audio/wav", start_time=0)
df_tweets = df_tweets[["Input","Sentiment", "Confidence"]]
# # Create an expander to show the data output
# with st.expander("View All Outputs"):
# st.table(df_tweets)
st.table(df_tweets)
# Add a summary section with a pie chart of sentiments
st.subheader(":blue[Summary]")
# Calculate sentiment counts
sentiment_counts = df_tweets['Sentiment'].value_counts()
#create an audio that read the summary output
outputsummary = audiosummary(sentiment_counts)
st.audio(outputsummary.data, format="audio/wav", start_time=0)
# Create a pie chart using Plotly Express
fig = px.pie(sentiment_counts, values=sentiment_counts.values, names=sentiment_counts.index)
st.plotly_chart(fig)
if __name__ == "__main__":
main()