import pandas as pd
import streamlit as st

from eda import plot_heatmap, plot_kde, plot_pie_chart

# Set page config (must be the first Streamlit call in the script)
st.set_page_config(
    page_icon=":chart_with_upwards_trend:",
    page_title="Twitter Sentiment Analysis",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Suppress Streamlit's deprecated global-pyplot warning (figures are passed explicitly below)
st.set_option('deprecation.showPyplotGlobalUse', False)

# Load the datasets (the CSVs have no header row, so column names are supplied)
column_names = ['tweet_id', 'entity', 'sentiment', 'content']
train = pd.read_csv("twitter_training.csv", sep=',', names=column_names)
validation = pd.read_csv("twitter_validation.csv", sep=',', names=column_names)

# Load the model pipeline (likely needs `from joblib import load` and `import json` if re-enabled)
# pipeline = load('model_pipeline.pkl')

# Load the selected features
# with open('selected_features.txt', 'r') as file_1:
#     selected_columns = json.load(file_1)
# with open('categorical_features.txt', 'r') as file_2:
#     categorical_columns = json.load(file_2)

# Sidebar for selecting the page
page = st.sidebar.selectbox("Select a page", ["EDA", "Prediction"])

if page == "EDA":
    st.title("Exploratory Data Analysis")

    # Custom CSS for the scroll bar
    st.markdown(
        """
        """,
        unsafe_allow_html=True,
    )

    # Display the DataFrames with a scroll bar and styled titles
    st.write(
        "<h3 style='text-align: center;'>The DataFrame of Train</h3>",
        unsafe_allow_html=True,
    )
    st.write(train)

    st.write(
        "<h3 style='text-align: center;'>The DataFrame of Validation</h3>",
        unsafe_allow_html=True,
    )
    st.write(validation)

    train_title = "Train Dataset"
    validation_title = "Validation Dataset"

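    # The eda helpers imported above are assumed to take a DataFrame plus a title
    # and return a matplotlib Figure that st.pyplot can render. A hypothetical
    # stand-in illustrating that contract (not the actual eda.py implementation;
    # kept commented so it does not shadow the real import):
    #
    # import matplotlib.pyplot as plt
    #
    # def plot_pie_chart(df, title):
    #     fig, ax = plt.subplots()
    #     df["sentiment"].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
    #     ax.set_ylabel("")
    #     ax.set_title(title)
    #     return fig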
    # Display insight title
    st.write(
        "<h3 style='text-align: center;'>Proportion of Sentiment Distribution</h3>",
        unsafe_allow_html=True,
    )

    # Plot pie chart for the train dataset
    fig_train_pie = plot_pie_chart(train, train_title)
    st.pyplot(fig_train_pie)

    # Plot pie chart for the validation dataset
    fig_validation_pie = plot_pie_chart(validation, validation_title)
    st.pyplot(fig_validation_pie)

    # Display insight paragraph
    st.write(
        "The dataset covers a range of sentiments: negative is the largest class with 22,542 "
        "instances, closely followed by positive with 20,832. Neutral tweets are also common at "
        "18,318 occurrences, while irrelevant is the least represented with 12,990. Overall the "
        "dataset is fairly balanced, with a slight lean towards negative sentiment. That mild "
        "imbalance is still worth accounting for, since it can affect how results are interpreted "
        "and how well sentiment models perform; strategies such as data augmentation or "
        "class-weighted training can help keep results accurate and unbiased."
    )

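    # A minimal sketch of the "class-weighted training" idea mentioned above,
    # assuming scikit-learn is installed (it is not used elsewhere in this app):
    # derive balancing weights directly from the training-set class counts.
    from sklearn.utils.class_weight import compute_class_weight

    with st.expander("Class balance check (sketch)"):
        sentiment_counts = train["sentiment"].value_counts()
        balanced_weights = compute_class_weight(
            class_weight="balanced",
            classes=sentiment_counts.index.to_numpy(),
            y=train["sentiment"].to_numpy(),
        )
        st.write("Sentiment counts:", sentiment_counts)
        st.write("Balanced class weights:", dict(zip(sentiment_counts.index, balanced_weights)))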
    # Display insight title
    st.write(
        "<h3 style='text-align: center;'>Count of Sentiment per Entity Distribution</h3>",
        unsafe_allow_html=True,
    )

    # Plot heatmap for the train dataset
    fig_train_heatmap = plot_heatmap(train, train_title)
    st.pyplot(fig_train_heatmap)

    # Plot heatmap for the validation dataset
    fig_validation_heatmap = plot_heatmap(validation, validation_title)
    st.pyplot(fig_validation_heatmap)

    # Display insight paragraph
    st.write(
        "Sentiment varies noticeably across entities. Some, such as **AssassinsCreed**, "
        "**Borderlands**, and **GrandTheftAuto(GTA)**, are widely praised and skew strongly "
        "positive. Others, such as **Battlefield**, **CS-GO**, and **LeagueOfLegends**, are more "
        "contentious, with a higher share of negative mentions. **Amazon** and **ApexLegends** "
        "show a fairly even mix that leans slightly neutral or positive, while **Fortnite**, "
        "**Facebook**, and **HomeDepot** are more mixed, with negativity slightly outweighing "
        "positivity. These patterns offer useful signals for companies and developers looking to "
        "gauge public opinion and improve their products or services."
    )

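    # The counts behind the heatmap above can be reproduced with a plain pandas
    # cross-tabulation (assuming the heatmap shows sentiment counts per entity,
    # as its title suggests):
    with st.expander("Sentiment counts per entity (sketch)"):
        entity_sentiment_counts = pd.crosstab(train["entity"], train["sentiment"])
        st.dataframe(entity_sentiment_counts)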
    # Display insight title
    st.write(
        "<h3 style='text-align: center;'>Content Density Distribution</h3>",
        unsafe_allow_html=True,
    )

    # Display KDE plot for the train dataset
    fig_train_kde = plot_kde(train, train_title)
    st.pyplot(fig_train_kde)

    # Display KDE plot for the validation dataset
    fig_validation_kde = plot_kde(validation, validation_title)
    st.pyplot(fig_validation_kde)

    # Display insight paragraph
    st.write(
        "The plots show a clear difference in how content density is distributed across sentiment "
        "categories in the two datasets. In the training set, positive sentiment has the highest "
        "content density, followed by negative, neutral, and finally irrelevant. In the validation "
        "set the ordering shifts: neutral has the highest density, followed by negative, positive, "
        "and irrelevant. This shift points to a mismatch in how sentiments are represented across "
        "the two splits, which could affect the performance and generalization of models trained "
        "on the training set and evaluated on the validation set. These variations should be "
        "considered during model development and evaluation to ensure robust performance across "
        "all sentiment categories."
    )
elif page == "Prediction":
    st.title("Prediction")

    # # Prediction bar for user input
    # with st.expander("User Input", expanded=True):
    #     # Create placeholders for user input
    #     user_input = {}

    #     # Create an input field for each selected feature
    #     for feature in selected_columns:
    #         st.markdown(f"<b>{feature}</b>", unsafe_allow_html=True)
    #         user_input[feature] = st.text_input("", key=feature)

    #     # Convert user input to a DataFrame and keep only the selected features
    #     input_df = pd.DataFrame([user_input])
    #     input_df = input_df[selected_columns]

    #     # Predict button
    #     if st.button('Predict'):
    #         prediction = pipeline.predict(input_df)
    #         st.write('Prediction:', prediction[0])