taliida-nabilah committed
Commit 2df2b07
Parent: df9d6c1

first commit

Files changed (5)
  1. app.py +156 -0
  2. eda.py +68 -0
  3. requirements.txt +6 -0
  4. twitter_training.csv +0 -0
  5. twitter_validation.csv +0 -0
app.py ADDED
@@ -0,0 +1,156 @@
+ import streamlit as st
+ import pandas as pd
+ from eda import plot_pie_chart, plot_heatmap, plot_kde
+
+ # Suppress the warning about calling st.pyplot() without a figure
+ # (this option may not be recognized by newer Streamlit releases)
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ column_names = ['tweet_id', 'entity', 'sentiment', 'content']
+
+ # Load the datasets
+ train = pd.read_csv("twitter_training.csv",
+                     sep=',', names=column_names)
+
+ validation = pd.read_csv("twitter_validation.csv",
+                          sep=',', names=column_names)
+
+ # Load the model pipeline
+ # pipeline = load('model_pipeline.pkl')
+
+ # Load the selected features
+ # with open('selected_features.txt', 'r') as file_1:
+ #     selected_columns = json.load(file_1)
+
+ # with open('categorical_features.txt', 'r') as file_2:
+ #     categorical_columns = json.load(file_2)
+
+
+ # Set page config
+ st.set_page_config(
+     page_icon=":chart_with_upwards_trend:",
+     page_title="Twitter Sentiment Analysis",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Sidebar for selecting the page
+ page = st.sidebar.selectbox("Select a page", ["EDA", "Prediction"])
+
+ if page == "EDA":
+     st.title("Exploratory Data Analysis")
+
+     # Custom CSS for the scroll bar
+     st.markdown(
+         """
+         <style>
+         ::-webkit-scrollbar {
+             width: 12px;
+         }
+         ::-webkit-scrollbar-track {
+             background: #f1f1f1;
+         }
+         ::-webkit-scrollbar-thumb {
+             background: #888;
+         }
+         ::-webkit-scrollbar-thumb:hover {
+             background: #555;
+         }
+         </style>
+         """,
+         unsafe_allow_html=True
+     )
+
+     # Display the train DataFrame with a scroll bar and a styled title
+     st.write(
+         "<div style='overflow-x: auto; text-align:center;'><span style='font-size:24px; color:#FF5733;'>The Train DataFrame</span></div>",
+         unsafe_allow_html=True
+     )
+
+     st.write(train)
+
+     # Display the validation DataFrame
+     st.write(
+         "<div style='overflow-x: auto; text-align:center;'><span style='font-size:24px; color:#FF5733;'>The Validation DataFrame</span></div>",
+         unsafe_allow_html=True
+     )
+
+     st.write(validation)
+
+     train_title = "Train Dataset"
+     validation_title = "Validation Dataset"
+
+     # Display insight title
+     st.write(
+         "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Proportion of Sentiment Distribution</h2></div>",
+         unsafe_allow_html=True
+     )
+
+     # Plot pie chart for the train dataset
+     fig_train_pie = plot_pie_chart(train, train_title)
+     st.pyplot(fig_train_pie)
+
+     # Plot pie chart for the validation dataset
+     fig_validation_pie = plot_pie_chart(validation, validation_title)
+     st.pyplot(fig_validation_pie)
+
+     # Display insight paragraph
+     st.write("The dataset shows a variety of sentiments, with the largest portion being negative at 22,542 instances, closely followed by positive sentiments at 20,832 instances. Neutral sentiments are also notable, though slightly less frequent, with 18,318 occurrences. In contrast, irrelevant sentiments are the least represented, appearing only 12,990 times. Overall, the dataset appears balanced, albeit with a slight lean towards negative sentiments. Researchers and analysts should be aware of this imbalance, as it could affect result interpretation and the performance of sentiment analysis models. Strategies like data augmentation or weighted modeling could help address this imbalance and ensure more accurate and unbiased results.")
+
+     # Display insight title
+     st.write(
+         "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Count of Sentiment per Entity Distribution</h2></div>",
+         unsafe_allow_html=True
+     )
+
+     # Plot heatmap for the train dataset
+     fig_train_heatmap = plot_heatmap(train, train_title)
+     st.pyplot(fig_train_heatmap)
+
+     # Plot heatmap for the validation dataset
+     fig_validation_heatmap = plot_heatmap(validation, validation_title)
+     st.pyplot(fig_validation_heatmap)
+
+     # Display insight paragraph
+     st.write("The data reveals varying sentiments across different entities. Some, like **AssassinsCreed**, **Borderlands**, and **GrandTheftAuto(GTA)**, are widely praised, with a strong positive sentiment. Others, such as **Battlefield**, **CS-GO**, and **LeagueOfLegends**, are more contentious, with a higher occurrence of negative sentiment. Interestingly, **Amazon** and **ApexLegends** have a balanced mix of sentiments, leaning slightly towards neutral or positive mentions. Conversely, **Fortnite**, **Facebook**, and **HomeDepot** show a more mixed sentiment, with negativity slightly outweighing positivity. These findings can provide valuable insights for companies and developers looking to gauge public opinion and improve their products or services.")
+
+     # Display insight title
+     st.write(
+         "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Content Length Density Distribution</h2></div>",
+         unsafe_allow_html=True
+     )
+
+     # Display KDE plot for the train dataset
+     fig_train_kde = plot_kde(train, train_title)
+     st.pyplot(fig_train_kde)
+
+     # Display KDE plot for the validation dataset
+     fig_validation_kde = plot_kde(validation, validation_title)
+     st.pyplot(fig_validation_kde)
+
+     # Display insight paragraph
+     st.write("The visualization shows a distinct trend in the content-length density of sentiment categories between the training and validation datasets. In the training dataset, the density is highest for positive sentiments, followed by negative, neutral, and finally irrelevant sentiments, suggesting a higher concentration of positive sentiments compared to negative ones, with neutral sentiments in between. In the validation dataset, however, the order differs: neutral sentiments have the highest density, followed by negative, positive, and finally irrelevant sentiments. This shift in density distribution between the two datasets implies potential discrepancies in sentiment representation, which could affect the performance and generalization of models trained on the training dataset and evaluated on the validation dataset. It is therefore important to consider and address these variations during model development and evaluation to ensure robust performance across sentiment categories.")
+
+ elif page == "Prediction":
+     st.title("Prediction")
+
+     # # Prediction form for user input
+     # with st.expander("User Input", expanded=True):
+     #     # Create placeholders for user input
+     #     user_input = {}
+
+     #     # Create an input field for each selected feature
+     #     for feature in selected_columns:
+     #         st.markdown(f"<div style='margin-bottom:10px'>{feature}</div>", unsafe_allow_html=True)
+     #         user_input[feature] = st.text_input("", key=feature)
+
+     #     # Convert user input to a DataFrame
+     #     input_df = pd.DataFrame([user_input])
+
+     #     # Keep only selected features
+     #     input_df = input_df[selected_columns]
+
+     #     # Predict button
+     #     if st.button('Predict'):
+     #         prediction = pipeline.predict(input_df)
+     #         st.write('Prediction:', prediction[0])
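The Prediction page is a stub in this commit: the pipeline loading and input handling are entirely commented out. Below is a minimal sketch, not part of the committed files, of how that flow could be wired up once a model exists. It assumes a scikit-learn text pipeline saved with joblib under the name model_pipeline.pkl (taken from the commented-out code; the artifact does not exist in this repository), and the widget labels are illustrative only.

    import joblib
    import streamlit as st

    @st.cache_resource
    def load_pipeline():
        # Load the fitted pipeline once and reuse it across Streamlit reruns
        return joblib.load("model_pipeline.pkl")  # hypothetical artifact, not in this commit

    pipeline = load_pipeline()

    tweet_text = st.text_area("Tweet content")
    if st.button("Predict") and tweet_text:
        # Assumes the pipeline maps raw text to a sentiment label
        prediction = pipeline.predict([tweet_text])
        st.write("Predicted sentiment:", prediction[0])

Caching the load with st.cache_resource avoids re-reading the pickle on every rerun, which matters because Streamlit re-executes the whole script on each interaction.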
eda.py ADDED
@@ -0,0 +1,68 @@
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+
+ def plot_pie_chart(data, title):
+     # Define a color for each sentiment class
+     colors = {
+         'Negative': '#8B0000',    # Dark red
+         'Positive': '#6B8E23',    # OliveDrab
+         'Irrelevant': '#808080',  # Gray
+         'Neutral': '#4169E1'      # RoyalBlue
+     }
+
+     # Define the desired class order
+     desired_order = ['Positive', 'Negative', 'Neutral', 'Irrelevant']
+
+     # Plot the pie chart
+     sorted_data = data['sentiment'].value_counts()[desired_order]
+     fig, ax = plt.subplots()
+     wedges, _, autotexts = ax.pie(
+         sorted_data,
+         labels=sorted_data.index,
+         autopct='%.1f%%',
+         colors=[colors.get(sentiment, '#CCCCCC') for sentiment in sorted_data.index],
+         startangle=90
+     )
+     plt.title(title)
+
+     # Add the raw count below each percentage label
+     for autotext, count in zip(autotexts, sorted_data):
+         autotext.set_text(f"{autotext.get_text()}\n({count})")
+
+     # Equal aspect ratio ensures the pie is drawn as a circle
+     plt.axis('equal')
+
+     return fig
+
+
+ def plot_heatmap(data, title):
+     # Count occurrences of each (entity, sentiment) pair
+     entity_counts = data.groupby(by=["entity", "sentiment"]).size().unstack(fill_value=0)
+
+     # Plot the heatmap
+     fig, ax = plt.subplots()
+     sns.heatmap(entity_counts, cmap="coolwarm", annot=True, fmt='d', linewidths=.5, ax=ax)
+     plt.title(title)
+     plt.xlabel("Sentiment")
+     plt.ylabel("Entity")
+     plt.xticks(rotation=45)
+     plt.tight_layout()
+
+     return fig
+
+
+ def plot_kde(data, title):
+     # Define a color for each sentiment class
+     colors = {
+         'Negative': '#8B0000',    # Dark red
+         'Positive': '#6B8E23',    # OliveDrab
+         'Irrelevant': '#808080',  # Gray
+         'Neutral': '#4169E1'      # RoyalBlue
+     }
+
+     # Define the desired class order
+     desired_order = ['Positive', 'Negative', 'Neutral', 'Irrelevant']
+
+     # Keep only rows whose content is a string, then plot the density of content length
+     text_rows = data[data['content'].apply(lambda x: isinstance(x, str))]
+     fig, ax = plt.subplots(figsize=(14, 8))
+     sns.kdeplot(data=text_rows, x=text_rows['content'].str.len(), hue='sentiment',
+                 palette=colors, linewidth=3, ax=ax, hue_order=desired_order)
+     ax.set_title(title)
+
+     return fig
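These helpers can be exercised outside Streamlit with a small hand-made frame; the rows below are illustrative and are not taken from the repository's CSV files. Note that plot_pie_chart indexes value_counts() with all four sentiment labels, so a frame missing any class would raise a KeyError.

    import pandas as pd
    from eda import plot_pie_chart, plot_heatmap, plot_kde

    # Illustrative sample with every sentiment class present and varying text lengths
    sample = pd.DataFrame({
        "tweet_id": range(8),
        "entity": ["Borderlands"] * 4 + ["Amazon"] * 4,
        "sentiment": ["Positive", "Negative", "Neutral", "Irrelevant"] * 2,
        "content": ["great game", "buggy launch day", "patch notes are out", "unrelated chatter",
                    "fast delivery", "package never arrived", "order shipped", "random meme"],
    })

    # Each helper returns a matplotlib Figure, so the plots can be saved directly
    plot_pie_chart(sample, "Sample").savefig("pie.png")
    plot_heatmap(sample, "Sample").savefig("heatmap.png")
    plot_kde(sample, "Sample").savefig("kde.png")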
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ joblib
+ pandas
+ matplotlib
+ scikit-learn
+ seaborn
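The requirements leave every dependency unpinned, so a fresh deployment will install whatever the latest releases happen to be. A hedged example of pinning is shown below; the version numbers are illustrative, not the versions the app was actually built against, and since newer Streamlit releases may no longer accept the deprecation.showPyplotGlobalUse option set in app.py, the Streamlit pin in particular is worth verifying before use.

    streamlit==1.31.1
    joblib==1.3.2
    pandas==2.2.0
    matplotlib==3.8.2
    scikit-learn==1.4.0
    seaborn==0.13.2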
twitter_training.csv ADDED
The diff for this file is too large to render. See raw diff
 
twitter_validation.csv ADDED
The diff for this file is too large to render. See raw diff