"""Streamlit app: United Airlines tweet sentiment analysis with spoken results.

Pipeline: tweet text -> fine-tuned sentiment classifier -> SpeechT5
text-to-speech readout of the prediction, plus a table, an audio summary,
and a sentiment pie chart.
"""

import streamlit as st
import pandas as pd
import torch
import plotly.express as px
import soundfile as sf  # noqa: F401 -- kept for parity with the original file
from IPython.display import Audio
from transformers import pipeline
from datasets import load_dataset
from num2words import num2words

# Model label ids -> human-readable sentiment names
# (mapping taken from the fine-tuned model's output convention).
_LABEL_MAPPING = {"LABEL_2": "positive", "LABEL_1": "neutral", "LABEL_0": "negative"}


@st.cache_resource
def _get_sentiment_pipeline():
    """Load the fine-tuned sentiment classifier once per server process."""
    return pipeline(
        "text-classification",
        model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis",
    )


@st.cache_resource
def _get_tts_pipeline():
    """Load the SpeechT5 text-to-speech pipeline once per server process."""
    return pipeline("text-to-speech", "microsoft/speecht5_tts")


@st.cache_resource
def _get_speaker_embedding():
    """Load the fixed CMU-Arctic x-vector speaker embedding (index 7306)."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def _synthesize(text):
    """Convert *text* to an ``IPython.display.Audio`` object via SpeechT5.

    Shared by :func:`sentiment2audio` and :func:`audiosummary` so the TTS
    pipeline and speaker embedding are set up in exactly one place.
    """
    speech = _get_tts_pipeline()(
        text, forward_params={"speaker_embeddings": _get_speaker_embedding()}
    )
    return Audio(speech["audio"], rate=speech["sampling_rate"])


def preprocess_dataset(file_path, num_samples_each_class):
    """Read the labelled tweet spreadsheet and return a class-balanced sample.

    Args:
        file_path: Path to an Excel file with ``text``,
            ``airline_sentiment`` and ``airline_sentiment_confidence`` columns.
        num_samples_each_class: Number of rows to draw per sentiment class.

    Returns:
        Tuple ``(text, sentiments)`` of two aligned pandas Series.
    """
    dataset = pd.read_excel(file_path)
    # Drop rows whose human annotation confidence is too low to trust.
    dataset = dataset[dataset["airline_sentiment_confidence"] > 0.5]
    dataset = dataset[["text", "airline_sentiment"]]
    # Randomly select an equal number of samples for each sentiment class.
    dataset = (
        dataset.groupby("airline_sentiment")
        .apply(lambda x: x.sample(n=num_samples_each_class))
        .reset_index(drop=True)
    )
    return dataset["text"], dataset["airline_sentiment"]


def txt2sentiment(text):
    """Classify *text* with the fine-tuned model.

    Returns:
        Tuple ``(text, predicted_label, predicted_confidence)`` where the
        label is one of ``positive``/``neutral``/``negative`` and the
        confidence is a percentage formatted to two decimals (string).
    """
    result = _get_sentiment_pipeline()(text)
    predicted_label = _LABEL_MAPPING[result[0]["label"]]
    predicted_confidence = "{:.2f}".format(result[0]["score"] * 100)
    return text, predicted_label, predicted_confidence


def sentiment2audio(userinput, sentiment, confidence):
    """Speak the tweet followed by its predicted sentiment and confidence."""
    text = f"{userinput}. The tweet is {sentiment}, with {confidence} percent confidence"
    return _synthesize(text)


def audiosummary(sentiment_counts):
    """Speak how many positive/negative/neutral tweets were analysed.

    Args:
        sentiment_counts: Mapping (e.g. ``Series.value_counts()``) from
            sentiment label to count; missing labels default to zero.
    """
    counts = {
        label: num2words(sentiment_counts.get(label, 0), lang="en")
        for label in ("positive", "negative", "neutral")
    }
    text = (
        f"There are {counts['positive']} positive tweets, "
        f"{counts['negative']} negative tweets, "
        f"and {counts['neutral']} neutral tweets"
    )
    return _synthesize(text)


def process_tweet(tweet):
    """Run the full pipeline on one tweet and return its display record."""
    # Stage 1: text -> sentiment.
    text, predicted_sentiment, predicted_confidence = txt2sentiment(tweet)
    # Stage 2: sentiment -> spoken audio (confidence read out in words).
    audio_output = sentiment2audio(
        text, predicted_sentiment, num2words(predicted_confidence, lang="en")
    )
    return {
        "Input": text,
        "Sentiment": predicted_sentiment,
        "Confidence": f"{predicted_confidence}%",
        "Audio": audio_output,
    }


def main():
    """Render the Streamlit UI: input, per-tweet audio, table, summary."""
    st.header(":rainbow[United Airlines Twitter Sentiment Analysis]", divider="rainbow")
    st.write(":blue[Enter tweets to analyze its sentiment(separated by @united) or upload a file:]")

    option = st.selectbox("Input type", ["Text Input", "File Upload"])
    tweets = []

    if option == "Text Input":
        user_input = st.text_input("Enter tweets (separated by @united)")
        if user_input:
            # "@united" is the delimiter; text before the first marker is dropped
            # (split()[1:]), matching the prompt shown to the user.
            for tweet in (part.strip() for part in user_input.split("@united")[1:]):
                tweets.append(process_tweet(tweet))
    elif option == "File Upload":
        uploaded_file = st.file_uploader("Upload csv file", type="csv")
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            if "text" not in df.columns:
                # Guard: a CSV without the expected column would raise KeyError.
                st.error("The uploaded file must contain a 'text' column.")
            else:
                df["text"] = df["text"].str.replace("@united", "")
                for tweet in df["text"]:
                    tweets.append(process_tweet(tweet))

    df_tweets = pd.DataFrame(tweets)
    # pd.DataFrame(...) is never None, so only the emptiness check is needed.
    if not df_tweets.empty:
        st.write(":red[Only the first 5 audio outputs will be displayed]")
        for _, row in df_tweets.head(5).iterrows():
            st.audio(row["Audio"].data, format="audio/wav", start_time=0)

        df_tweets = df_tweets[["Input", "Sentiment", "Confidence"]]
        st.table(df_tweets)

        # Summary section: spoken counts plus a pie chart of sentiments.
        st.subheader(":blue[Summary]")
        sentiment_counts = df_tweets["Sentiment"].value_counts()
        outputsummary = audiosummary(sentiment_counts)
        st.audio(outputsummary.data, format="audio/wav", start_time=0)
        fig = px.pie(
            sentiment_counts,
            values=sentiment_counts.values,
            names=sentiment_counts.index,
        )
        st.plotly_chart(fig)


if __name__ == "__main__":
    main()