Spaces: Runtime error

taliida-nabilah committed • 2df2b07
Parent(s): df9d6c1

first commit
Files changed:
- app.py +156 -0
- eda.py +68 -0
- requirements.txt +6 -0
- twitter_training.csv +0 -0
- twitter_validation.csv +0 -0
app.py
ADDED
@@ -0,0 +1,156 @@
import streamlit as st
import pandas as pd
from eda import plot_pie_chart, plot_heatmap, plot_kde

# Set page config (this must be the first Streamlit command in the script)
st.set_page_config(
    page_icon=":chart_with_upwards_trend:",
    page_title="Twitter Sentiment Analysis",
    layout="wide",
    initial_sidebar_state="expanded"
)

# This config option was removed in newer Streamlit releases; guard it so
# the app still starts when the key no longer exists.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except Exception:
    pass

column_names = ['tweet_id', 'entity', 'sentiment', 'content']

# Load the datasets (the CSVs ship without a header row, so names are supplied)
train = pd.read_csv("twitter_training.csv",
                    sep=',', names=column_names)

validation = pd.read_csv("twitter_validation.csv",
                         sep=',', names=column_names)

# Load the model pipeline
# pipeline = load('model_pipeline.pkl')

# Load the selected features
# with open('selected_features.txt', 'r') as file_1:
#     selected_columns = json.load(file_1)

# with open('categorical_features.txt', 'r') as file_2:
#     categorical_columns = json.load(file_2)

# Sidebar for selecting the page
page = st.sidebar.selectbox("Select a page", ["EDA", "Prediction"])

if page == "EDA":
    st.title("Exploratory Data Analysis")

    # Custom CSS for the scroll bar
    st.markdown(
        """
        <style>
        ::-webkit-scrollbar {
            width: 12px;
        }
        ::-webkit-scrollbar-track {
            background: #f1f1f1;
        }
        ::-webkit-scrollbar-thumb {
            background: #888;
        }
        ::-webkit-scrollbar-thumb:hover {
            background: #555;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

    # Display the DataFrames with a scroll bar and styled titles
    st.write(
        "<div style='overflow-x: auto; text-align:center;'><span style='font-size:24px; color:#FF5733;'>The Train DataFrame</span></div>",
        unsafe_allow_html=True
    )
    st.write(train)

    st.write(
        "<div style='overflow-x: auto; text-align:center;'><span style='font-size:24px; color:#FF5733;'>The Validation DataFrame</span></div>",
        unsafe_allow_html=True
    )
    st.write(validation)

    train_title = "Train Dataset"
    validation_title = "Validation Dataset"

    # Insight: proportion of each sentiment class
    st.write(
        "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Proportion of Sentiment Distribution</h2></div>",
        unsafe_allow_html=True
    )

    # Plot pie charts for the train and validation datasets
    fig_train_pie = plot_pie_chart(train, train_title)
    st.pyplot(fig_train_pie)

    fig_validation_pie = plot_pie_chart(validation, validation_title)
    st.pyplot(fig_validation_pie)

    # Display insight paragraph
    st.write("The dataset covers a range of sentiments: negative is the largest class at 22,542 instances, closely followed by positive at 20,832. Neutral sentiments are also common, with 18,318 occurrences, while irrelevant sentiments are the least represented at 12,990. Overall the dataset is roughly balanced, with a slight lean towards negative sentiment. Analysts should keep this imbalance in mind, since it can affect both interpretation and the performance of sentiment analysis models; strategies such as data augmentation or class weighting can help mitigate it.")

    # Insight: sentiment counts per entity
    st.write(
        "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Count of Sentiment per Entity Distribution</h2></div>",
        unsafe_allow_html=True
    )

    # Plot heatmaps for the train and validation datasets
    fig_train_heatmap = plot_heatmap(train, train_title)
    st.pyplot(fig_train_heatmap)

    fig_validation_heatmap = plot_heatmap(validation, validation_title)
    st.pyplot(fig_validation_heatmap)

    # Display insight paragraph
    st.write("Sentiment varies noticeably across entities. Some, such as **AssassinsCreed**, **Borderlands**, and **GrandTheftAuto(GTA)**, are widely praised, with strongly positive sentiment. Others, such as **Battlefield**, **CS-GO**, and **LeagueOfLegends**, are more contentious, with a higher share of negative sentiment. **Amazon** and **ApexLegends** show a balanced mix, leaning slightly towards neutral or positive mentions, while **Fortnite**, **Facebook**, and **HomeDepot** skew mixed, with negativity slightly outweighing positivity. These findings can help companies and developers gauge public opinion and improve their products or services.")

    # Insight: density of tweet length per sentiment
    st.write(
        "<div style='text-align:center; margin-bottom: 20px;'><h2 style='font-size:24px; color:#FF5733;'>Content Length Density Distribution</h2></div>",
        unsafe_allow_html=True
    )

    # Display KDE plots for the train and validation datasets
    fig_train_kde = plot_kde(train, train_title)
    st.pyplot(fig_train_kde)

    fig_validation_kde = plot_kde(validation, validation_title)
    st.pyplot(fig_validation_kde)

    # Display insight paragraph
    st.write("The KDE plots show a clear difference in content-length density between the two splits. In the training dataset, density is highest for positive sentiment, followed by negative, neutral, and finally irrelevant. In the validation dataset the ordering changes: neutral sentiment has the highest density, followed by negative, positive, and irrelevant. This shift suggests the two splits represent the sentiment classes somewhat differently, which can affect how well a model trained on the training set generalizes to the validation set, so the variation should be considered during model development and evaluation.")

elif page == "Prediction":
    st.title("Prediction")

    # # Prediction form for user input
    # with st.expander("User Input", expanded=True):
    #     # Create placeholders for user input
    #     user_input = {}

    #     # Create input fields for each selected feature
    #     for feature in selected_columns:
    #         st.markdown(f"<div style='margin-bottom:10px'>{feature}</div>", unsafe_allow_html=True)
    #         user_input[feature] = st.text_input("", key=feature)

    #     # Convert user input to a DataFrame and keep only the selected features
    #     input_df = pd.DataFrame([user_input])
    #     input_df = input_df[selected_columns]

    #     # Predict button
    #     if st.button('Predict'):
    #         prediction = pipeline.predict(input_df)
    #         st.write('Prediction:', prediction[0])
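The two pd.read_csv calls above run on every Streamlit rerun and raise immediately if a CSV is missing. Below is a minimal sketch of a cached, defensive loading step; load_split is a hypothetical helper name, and the file names and column layout are assumed to match app.py.

# Sketch only: cached, defensive data loading for the Streamlit app.
import pandas as pd
import streamlit as st

COLUMN_NAMES = ['tweet_id', 'entity', 'sentiment', 'content']

@st.cache_data
def load_split(path: str) -> pd.DataFrame:
    # The CSVs have no header row, so column names are supplied explicitly.
    return pd.read_csv(path, sep=',', names=COLUMN_NAMES)

try:
    train = load_split("twitter_training.csv")
    validation = load_split("twitter_validation.csv")
except FileNotFoundError as err:
    # Show a readable error in the app instead of a raw traceback.
    st.error(f"Missing data file: {err.filename}")
    st.stop()

With st.cache_data, the CSVs are parsed once per session rather than on every widget interaction, which keeps the EDA page responsive.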
eda.py
ADDED
@@ -0,0 +1,68 @@
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt


def plot_pie_chart(data, title):
    # Define colors for each sentiment class
    colors = {
        'Negative': '#8B0000',    # Dark red (blood red)
        'Positive': '#6B8E23',    # OliveDrab
        'Irrelevant': '#808080',  # Gray
        'Neutral': '#4169E1'      # RoyalBlue
    }

    # Define the desired plotting order
    desired_order = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

    # Plot pie chart of sentiment counts
    sorted_data = data['sentiment'].value_counts()[desired_order]
    fig, ax = plt.subplots()
    wedges, _, autotexts = ax.pie(
        sorted_data,
        labels=sorted_data.index,
        autopct='%.1f%%',
        colors=[colors.get(sentiment, '#CCCCCC') for sentiment in sorted_data.index],
        startangle=90
    )
    plt.title(title)

    # Add the raw count below each percentage label
    for autotext, count in zip(autotexts, sorted_data):
        autotext.set_text(f"{autotext.get_text()}\n({count})")

    # Equal aspect ratio ensures the pie is drawn as a circle
    plt.axis('equal')

    # Return plot
    return fig


def plot_heatmap(data, title):
    # Count occurrences of each sentiment per entity
    # (entities as rows, sentiment classes as columns)
    entity_counts = data.groupby(by=["entity", "sentiment"]).size().unstack(fill_value=0)

    # Plot heatmap
    fig, ax = plt.subplots()
    sns.heatmap(entity_counts, cmap="coolwarm", annot=True, fmt='d', linewidths=.5, ax=ax)
    plt.title(title)
    plt.xlabel("Sentiment")
    plt.ylabel("Entity")
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Return plot
    return fig


def plot_kde(data, title):
    # Define colors for each sentiment class
    colors = {
        'Negative': '#8B0000',    # Dark red (blood red)
        'Positive': '#6B8E23',    # OliveDrab
        'Irrelevant': '#808080',  # Gray
        'Neutral': '#4169E1'      # RoyalBlue
    }

    # Define the desired plotting order
    desired_order = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

    # Keep only rows whose content is a string and compute the tweet length,
    # so that the frame passed to seaborn and the x values stay aligned
    text_data = data[data['content'].apply(lambda x: isinstance(x, str))].copy()
    text_data['content_length'] = text_data['content'].str.len()

    # Plot KDE of content length per sentiment class
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.kdeplot(data=text_data, x='content_length', hue='sentiment',
                palette=colors, linewidth=3, ax=ax, hue_order=desired_order)
    ax.set_title(title)

    # Return plot
    return fig
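Because each helper returns a matplotlib Figure, the plots can also be smoke-tested without launching Streamlit. A minimal sketch, assuming twitter_training.csv is available in the working directory:

# Sketch only: render each eda.py helper to a PNG for a quick local check.
import pandas as pd
from eda import plot_pie_chart, plot_heatmap, plot_kde

column_names = ['tweet_id', 'entity', 'sentiment', 'content']
train = pd.read_csv("twitter_training.csv", sep=',', names=column_names)

for make_plot in (plot_pie_chart, plot_heatmap, plot_kde):
    # Each helper takes the DataFrame and a title and returns a Figure.
    fig = make_plot(train, "Train Dataset")
    fig.savefig(f"{make_plot.__name__}.png", bbox_inches="tight")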
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
joblib
pandas
matplotlib
scikit-learn
seaborn
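The requirements are unpinned, so the Space installs whatever versions are current at build time; the deprecation.showPyplotGlobalUse option used in app.py appears to have been removed in recent Streamlit releases, which is one plausible source of the runtime error. A sketch of a pinned variant follows; the version numbers are illustrative placeholders, not tested against this app.

streamlit==1.32.0
joblib==1.3.2
pandas==2.2.0
matplotlib==3.8.2
scikit-learn==1.4.0
seaborn==0.13.2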
twitter_training.csv
ADDED
The diff for this file is too large to render.
twitter_validation.csv
ADDED
The diff for this file is too large to render.