"""Streamlit app: upload an Excel workbook and explore it through several charts.

The sidebar holds the file uploader and the chart selector. Each chart is
drawn by its own private ``_show_*`` function, which receives the loaded
DataFrame, creates the widgets it needs and renders one matplotlib figure.
"""

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Shared by the two charts that need one categorical and one numeric column.
CATEGORY_NUMERIC_ERROR = "Ensure you select a categorical column and a numeric column."


def _is_numeric(series):
    """Return True for integer/float-like columns; booleans are not counted."""
    dtype = series.dtype
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


def _is_categorical(series):
    """Return True for object, string or pandas-categorical columns."""
    dtype = series.dtype
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def _numeric_columns(data):
    """Return the labels of all numeric columns of *data*, in column order."""
    return [name for name in data.columns if _is_numeric(data[name])]


def _render(fig):
    """Send *fig* to the page, then close it.

    pyplot keeps every figure it created alive until it is closed; without
    the explicit close each Streamlit rerun would leave one more figure in
    memory.
    """
    st.pyplot(fig)
    plt.close(fig)


def _show_distribution(data):
    """Task 1: count plot for a categorical column, histogram for a numeric one."""
    st.header("Distribution of a Column")
    column = st.selectbox("Select Column for Distribution", data.columns)
    series = data[column]

    if _is_categorical(series):
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(data=data, x=column, palette='coolwarm', ax=ax)
    elif _is_numeric(series):
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(data=data, x=column, kde=True, color='blue', ax=ax)
    else:
        st.error("Selected column is not suitable for distribution visualization.")
        return

    ax.set_title(f'Distribution of {column}')
    _render(fig)


def _show_top_n(data):
    """Task 2: horizontal bar chart of the N categories with the largest sum."""
    st.header("Top-N Categories by Aggregated Value")
    category_column = st.selectbox("Select Category Column", data.columns)
    numeric_column = st.selectbox("Select Numeric Column", data.columns)
    n = st.slider("Select Top-N Categories", 1, 20, 5)

    if not (_is_categorical(data[category_column]) and _is_numeric(data[numeric_column])):
        st.error(CATEGORY_NUMERIC_ERROR)
        return

    grouped_data = data.groupby(category_column)[numeric_column].sum().reset_index()
    top_n = grouped_data.sort_values(by=numeric_column, ascending=False).head(n)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=top_n, x=numeric_column, y=category_column, palette='viridis', ax=ax)
    ax.set_title(f'Top-{n} {category_column} by {numeric_column}')
    _render(fig)


def _show_heatmap(data):
    """Task 3: annotated correlation heatmap over all numeric columns."""
    st.header("Heatmap of Numerical Data")
    numeric_columns = _numeric_columns(data)

    # A correlation matrix needs at least two columns to be informative.
    if len(numeric_columns) <= 1:
        st.error("Dataset does not have enough numerical columns for a heatmap.")
        return

    heatmap_data = data[numeric_columns].corr()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', ax=ax)
    ax.set_title("Correlation Heatmap")
    _render(fig)


def _show_outliers(data):
    """Task 4: scatter of a numeric column with |z-score| > threshold in red."""
    st.header("Outlier Detection")
    numeric_column = st.selectbox(
        "Select Numeric Column for Outlier Detection", _numeric_columns(data)
    )
    threshold = st.slider("Select Z-Score Threshold", 1.0, 5.0, 3.0)

    # selectbox yields None when there is nothing to choose from.
    if numeric_column is None:
        st.error("The dataset has no numeric columns for outlier detection.")
        return

    # nan_policy="omit": with the default policy a single missing cell turns
    # every z-score into NaN, so no outlier would ever be flagged.
    # The scores live in their own Series so the uploaded data is not modified.
    z_scores = pd.Series(
        zscore(data[numeric_column].to_numpy(dtype=float), nan_policy="omit"),
        index=data.index,
    )
    outliers = data[z_scores.abs() > threshold]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=data, x=data.index, y=numeric_column,
                    label='Data', color='blue', ax=ax)
    sns.scatterplot(data=outliers, x=outliers.index, y=numeric_column,
                    label='Outliers', color='red', ax=ax)
    ax.set_title(f'Outlier Detection in {numeric_column}')
    ax.legend()
    _render(fig)


def _show_box_plot(data):
    """Task 5: box plot of a numeric column split by a categorical column."""
    st.header("Box Plot Comparison")
    category_column = st.selectbox("Select Categorical Column", data.columns)
    numeric_column = st.selectbox("Select Numeric Column", data.columns)

    if not (_is_categorical(data[category_column]) and _is_numeric(data[numeric_column])):
        st.error(CATEGORY_NUMERIC_ERROR)
        return

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=data, x=category_column, y=numeric_column, palette='Set2', ax=ax)
    ax.set_title(f'Box Plot of {numeric_column} by {category_column}')
    _render(fig)


def _show_time_series(data):
    """Task 6: monthly sum of a numeric column, keyed on the 'Time' column."""
    st.header("Time Series Analysis")
    if 'Time' not in data.columns:
        st.error("The dataset does not have a time column.")
        return

    # Unparsable cells become NaT instead of raising; groupby drops them later.
    timestamps = pd.to_datetime(data['Time'], errors='coerce')
    if timestamps.isna().all():
        st.error("The 'Time' column could not be parsed as dates.")
        return
    data = data.assign(Time=timestamps)

    time_column = st.selectbox("Select Time Column", ['Time'])
    value_column = st.selectbox("Select Value Column", _numeric_columns(data))
    if value_column is None:
        st.error("The dataset has no numeric columns to plot over time.")
        return

    monthly = data[time_column].dt.to_period('M')
    time_data = data.groupby(monthly)[value_column].sum().reset_index()
    # Periods are converted back to timestamps so the x axis is a date axis.
    time_data[time_column] = time_data[time_column].dt.to_timestamp()

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.lineplot(data=time_data, x=time_column, y=value_column, ax=ax, marker='o')
    ax.set_title(f'Time Series Analysis of {value_column}')
    _render(fig)


def _show_stacked_bar(data):
    """Task 7: stacked bars of several numeric columns summed per category."""
    st.header("Stacked Bar Chart")
    category_column = st.selectbox("Select Categorical Column", data.columns)
    numeric_columns = st.multiselect("Select Numeric Columns", _numeric_columns(data))

    # "is not None" rather than truthiness: a column labelled 0 or "" is valid.
    if len(numeric_columns) <= 1 or category_column is None:
        st.error("Ensure you select one categorical column and multiple numeric columns.")
        return

    grouped_data = data.groupby(category_column)[numeric_columns].sum()
    fig, ax = plt.subplots(figsize=(12, 8))
    grouped_data.plot(kind='bar', stacked=True, ax=ax, colormap='coolwarm')
    # Column labels read from Excel are not guaranteed to be strings.
    joined_names = ", ".join(str(name) for name in numeric_columns)
    ax.set_title(f'Stacked Bar Chart of {joined_names} by {category_column}')
    _render(fig)


# Sidebar label -> renderer. Insertion order is the order shown in the selector.
VISUALIZATIONS = {
    "Distribution of a Column": _show_distribution,
    "Top-N Categories by Aggregated Value": _show_top_n,
    "Heatmap of Numerical Data": _show_heatmap,
    "Outlier Detection": _show_outliers,
    "Box Plot Comparison": _show_box_plot,
    "Time Series Analysis": _show_time_series,
    "Stacked Bar Chart": _show_stacked_bar,
}


def main():
    """Build the page: uploader, dataset preview, chart selector, chosen chart."""
    # Set up the Streamlit page
    st.title("Data Visualization")
    st.sidebar.header("Upload and Navigation")

    # File upload
    uploaded_file = st.sidebar.file_uploader("Upload your Excel file", type=["xlsx"])
    if not uploaded_file:
        st.warning("Please upload an Excel file to begin.")
        return

    # Load the dataset
    data = pd.read_excel(uploaded_file)
    st.sidebar.success("File uploaded successfully!")
    st.write("### Dataset Overview")
    st.dataframe(data.head())

    # Every chart needs at least one column to select from.
    if data.columns.empty:
        st.error("The uploaded file contains no columns to visualize.")
        return

    # Navigation options
    options = st.sidebar.selectbox("Select Visualization", list(VISUALIZATIONS))
    VISUALIZATIONS[options](data)


# Called unconditionally, as the original script ran its body at module level.
main()