File size: 6,158 Bytes
0fa70df
 
 
 
 
 
 
 
 
3593cb1
0fa70df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75c9e82
0fa70df
cd0e2ba
0fa70df
 
 
cd0e2ba
0fa70df
 
 
 
357d56c
96dbda2
082be25
 
96dbda2
082be25
 
96dbda2
082be25
 
6c1dca8
631bf65
 
 
 
 
 
 
 
9227144
1916040
75c9e82
9227144
1916040
75c9e82
9227144
1916040
 
75c9e82
1916040
cfad33c
1916040
 
 
666eadf
 
 
9227144
0fa70df
 
9227144
3593cb1
9227144
 
 
 
1916040
 
9227144
ec7a369
9227144
 
 
8625123
9227144
904173e
666eadf
 
 
1916040
e54bd75
 
9227144
e54bd75
 
 
33e5a68
 
e54bd75
37d43c2
666eadf
 
 
c8ca882
1916040
 
631bf65
1916040
904173e
631bf65
ec7a369
631bf65
ec7a369
 
 
f1aefa1
6c515f5
51d1ed9
c8ca882
ec7a369
6c515f5
 
 
 
28c6e6d
3593cb1
95e55e7
3593cb1
 
6d42812
3593cb1
357d56c
 
99b8a87
631bf65
5797c96
 
 
3593cb1
 
51d1ed9
0fa70df
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156

import streamlit as st
import pandas as pd
from IPython.display import Audio
import torch
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
from num2words import num2words
import plotly.express as px

# Load the labelled tweet dataset and balance it across sentiment classes
def preprocess_dataset(file_path, num_samples_each_class, random_state=None):
    """Load labelled tweets and draw a class-balanced random sample.

    Rows whose ``airline_sentiment_confidence`` is not above 0.5 are
    discarded, then ``num_samples_each_class`` rows are sampled (without
    replacement) from every remaining ``airline_sentiment`` class.

    Args:
        file_path: Path (or file-like object) of an Excel workbook readable
            by ``pandas.read_excel``. An already-loaded ``DataFrame`` is
            also accepted and used as-is; it is not modified.
        num_samples_each_class: Number of rows to draw from each class.
        random_state: Optional seed forwarded to the sampler so that the
            selection is reproducible. ``None`` keeps the sampling random.

    Returns:
        A ``(text, sentiments)`` tuple of Series sharing a fresh 0..n-1
        index, holding the ``text`` and ``airline_sentiment`` columns.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a class has fewer than ``num_samples_each_class``
            rows left after the confidence filter.
    """
    if isinstance(file_path, pd.DataFrame):
        dataset = file_path
    else:
        dataset = pd.read_excel(file_path)

    confident = dataset[dataset['airline_sentiment_confidence'] > 0.5]
    labelled = confident[['text', 'airline_sentiment']]

    # GroupBy.sample keeps the grouping column in the result, unlike
    # groupby(...).apply(...), whose handling of grouping columns has been
    # deprecated since pandas 2.2.
    balanced = (
        labelled.groupby('airline_sentiment')
        .sample(n=num_samples_each_class, random_state=random_state)
        .reset_index(drop=True)
    )
    return balanced['text'], balanced['airline_sentiment']

def _get_sentiment_classifier():
    """Return the sentiment pipeline, loading the model on first use only.

    The loaded pipeline is stored on this function object so that
    classifying many tweets in one run does not reload the model per tweet.
    """
    classifier = getattr(_get_sentiment_classifier, "_instance", None)
    if classifier is None:
        classifier = pipeline(
            "text-classification",
            model="Kayyyy27/fine-tuned-United_Airlines_Twitter_Sentiment_Analysis",
        )
        _get_sentiment_classifier._instance = classifier
    return classifier


def txt2sentiment(text, classifier=None):
    """Classify the sentiment of ``text``.

    Args:
        text: The tweet text to classify.
        classifier: Optional callable taking the text and returning a list
            whose first item is a dict with ``'label'`` and ``'score'``
            keys. When omitted, the fine-tuned United Airlines model is
            loaded once and reused on later calls.

    Returns:
        A ``(text, predicted_label, predicted_confidence)`` tuple where
        ``predicted_label`` is ``"positive"``, ``"neutral"`` or
        ``"negative"`` and ``predicted_confidence`` is the score as a
        percentage formatted as a string with two decimals.

    Raises:
        KeyError: If the classifier returns a label other than
            ``LABEL_0``, ``LABEL_1`` or ``LABEL_2``.
    """
    if classifier is None:
        classifier = _get_sentiment_classifier()

    top_prediction = classifier(text)[0]

    # Translate the model's generic output labels into sentiment names.
    label_mapping = {"LABEL_2": "positive", "LABEL_1": "neutral", "LABEL_0": "negative"}
    predicted_label = label_mapping[top_prediction['label']]
    predicted_confidence = "{:.2f}".format(top_prediction['score'] * 100)
    return text, predicted_label, predicted_confidence

def _load_tts_resources():
    """Return ``(synthesizer, speaker_embedding)``, loading them on first use.

    Both objects are stored on this function object so that the
    text-to-speech model and the speaker-embedding dataset are loaded once
    per run instead of once per synthesized clip.
    """
    resources = getattr(_load_tts_resources, "_instance", None)
    if resources is None:
        synthesizer = pipeline("text-to-speech", "microsoft/speecht5_tts")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Entry 7306 of the x-vector dataset supplies the speaker embedding;
        # unsqueeze adds the leading batch dimension.
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        resources = (synthesizer, speaker_embedding)
        _load_tts_resources._instance = resources
    return resources


def _synthesize_speech(text):
    """Convert ``text`` to speech and wrap the waveform in an ``Audio`` object."""
    synthesizer, speaker_embedding = _load_tts_resources()
    speech = synthesizer(text, forward_params={"speaker_embeddings": speaker_embedding})
    return Audio(speech['audio'], rate=speech['sampling_rate'])


def sentiment2audio(userinput, sentiment, confidence):
    """Build a spoken version of one tweet's analysis result.

    Args:
        userinput: The tweet text, read out first.
        sentiment: The predicted sentiment name.
        confidence: The confidence, already spelled out in words.

    Returns:
        An ``IPython.display.Audio`` object holding the synthesized clip.
    """
    text = f"{userinput}. The tweet is {sentiment}, with {confidence} percent confidence"
    return _synthesize_speech(text)


def audiosummary(sentiment_counts):
    """Build a spoken summary of how many tweets fall in each sentiment.

    Args:
        sentiment_counts: Mapping-like object (e.g. the result of
            ``Series.value_counts()``) supporting ``.get(label, default)``
            for the labels ``'positive'``, ``'negative'`` and ``'neutral'``.
            Missing labels count as zero.

    Returns:
        An ``IPython.display.Audio`` object holding the synthesized clip.
    """
    # int() turns NumPy integer counts into plain ints before spelling out.
    positive_sentiment_count = num2words(int(sentiment_counts.get('positive', 0)), lang='en')
    negative_sentiment_count = num2words(int(sentiment_counts.get('negative', 0)), lang='en')
    neutral_sentiment_count = num2words(int(sentiment_counts.get('neutral', 0)), lang='en')

    text = f"There are {positive_sentiment_count} positive tweets, {negative_sentiment_count} negative tweets, and {neutral_sentiment_count} neutral tweets"
    return _synthesize_speech(text)


def process_tweet(tweet):
    """Run one tweet through sentiment analysis and speech synthesis.

    Args:
        tweet: The tweet text to analyse.

    Returns:
        A dict with the keys ``'Input'`` (the analysed text),
        ``'Sentiment'`` (the predicted label), ``'Confidence'`` (the
        confidence followed by ``%``) and ``'Audio'`` (the spoken result).
    """
    # Stage 1: classify the text.
    analysed_text, label, confidence_pct = txt2sentiment(tweet)

    # Stage 2: speak the result, with the confidence spelled out in words.
    spoken_confidence = num2words(confidence_pct, lang='en')
    clip = sentiment2audio(analysed_text, label, spoken_confidence)

    return {
        'Input': analysed_text,
        'Sentiment': label,
        'Confidence': f'{confidence_pct}%',
        'Audio': clip,
    }

def _split_tweets(user_input):
    """Split raw text on the ``@united`` marker and return the non-empty tweets.

    Text before the first marker is kept, so input that does not contain
    ``@united`` at all is treated as a single tweet.
    """
    stripped = (segment.strip() for segment in user_input.split("@united"))
    return [segment for segment in stripped if segment]


def _tweets_from_csv(uploaded_file):
    """Read the ``text`` column of an uploaded CSV and return the cleaned tweets.

    Missing values and rows that are empty once ``@united`` is removed are
    skipped.

    Raises:
        ValueError: If the file cannot be parsed as CSV or has no ``text``
            column.
    """
    df = pd.read_csv(uploaded_file)
    if "text" not in df.columns:
        raise ValueError("the file has no 'text' column")
    cleaned = (
        df["text"]
        .dropna()
        .astype(str)
        # regex=False: "@united" is a literal marker, and the default of
        # this flag differs between pandas versions.
        .str.replace("@united", "", regex=False)
        .str.strip()
    )
    return [tweet for tweet in cleaned if tweet]


def _analyze_tweets(raw_tweets, audio_limit):
    """Analyse every tweet, synthesizing audio only for the first ``audio_limit``.

    Later tweets are classified only (their ``'Audio'`` entry is ``None``),
    because their clips would never be played.
    """
    results = []
    for position, tweet in enumerate(raw_tweets):
        if position < audio_limit:
            results.append(process_tweet(tweet))
            continue
        text, sentiment, confidence = txt2sentiment(tweet)
        results.append({
            'Input': text,
            'Sentiment': sentiment,
            'Confidence': f'{confidence}%',
            'Audio': None,
        })
    return results


def main():
    """Render the Streamlit page: collect tweets, analyse them, show results.

    Tweets come either from a text box (separated by ``@united``) or from an
    uploaded CSV file with a ``text`` column. The page shows audio players
    for the first five results, a table of all results, a spoken summary and
    a pie chart of the sentiment distribution.
    """
    # Must stay in sync with the "first 5" notice rendered below.
    audio_limit = 5

    st.header(":rainbow[United Airlines Twitter Sentiment Analysis]", divider='rainbow')
    st.write(":blue[Enter tweets to analyze its sentiment(separated by @united) or upload a file:]")

    # User input options
    option = st.selectbox("Input type", ["Text Input", "File Upload"])

    raw_tweets = []

    if option == "Text Input":
        user_input = st.text_input("Enter tweets (separated by @united)")
        if user_input:
            raw_tweets = _split_tweets(user_input)

    elif option == "File Upload":
        uploaded_file = st.file_uploader("Upload csv file", type="csv")
        if uploaded_file is not None:
            try:
                raw_tweets = _tweets_from_csv(uploaded_file)
            except ValueError as err:
                # pandas' CSV parsing errors are ValueError subclasses too.
                st.error(f"Could not read the uploaded file: {err}")

    tweets = _analyze_tweets(raw_tweets, audio_limit)

    st.write(":red[Only the first 5 audio outputs will be displayed]")

    if not tweets:
        return

    # Create a DataFrame from the list of tweet information
    df_tweets = pd.DataFrame(tweets)

    # One audio player per result, limited to the first few rows
    for audio in df_tweets["Audio"].head(audio_limit):
        st.audio(audio.data, format="audio/wav", start_time=0)

    # Display the results as a table (the audio objects are not tabular)
    df_tweets = df_tweets[["Input", "Sentiment", "Confidence"]]
    st.table(df_tweets)

    # Add a summary section with a pie chart of sentiments
    st.subheader(":blue[Summary]")

    # Calculate sentiment counts
    sentiment_counts = df_tweets['Sentiment'].value_counts()

    # Create an audio clip that reads the summary aloud
    outputsummary = audiosummary(sentiment_counts)
    st.audio(outputsummary.data, format="audio/wav", start_time=0)

    # Create a pie chart using Plotly Express
    fig = px.pie(sentiment_counts, values=sentiment_counts.values, names=sentiment_counts.index)
    st.plotly_chart(fig)


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()