File size: 6,587 Bytes
9d8db88
 
 
 
282b6be
9d8db88
 
282b6be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d8db88
 
282b6be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d8db88
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Set up the Streamlit page
st.title("Data Visualization")
st.sidebar.header("Upload and Navigation")

# File upload
uploaded_file = st.sidebar.file_uploader("Upload your Excel file", type=["xlsx"])


def _is_numeric(dtype):
    """Return True for numeric dtypes of any width, excluding booleans."""
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


def _is_categorical(dtype):
    """Return True for dtypes plotted as discrete categories (text or boolean)."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or pd.api.types.is_bool_dtype(dtype)
    )


def _numeric_columns(frame):
    """Return the labels of the numeric (non-boolean) columns of *frame*."""
    return [name for name, dtype in frame.dtypes.items() if _is_numeric(dtype)]


def _show_figure(fig):
    """Render *fig* in the app, then close it.

    Figures created through pyplot stay registered until closed, so without
    the close every script rerun would leave another figure in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


if uploaded_file:
    # Load the dataset; this is the app's input boundary, so any reader
    # failure is reported to the user instead of surfacing as a traceback.
    try:
        data = pd.read_excel(uploaded_file)
    except Exception as exc:
        st.sidebar.error(f"Could not read the uploaded file: {exc}")
        st.stop()

    st.sidebar.success("File uploaded successfully!")
    st.write("### Dataset Overview")
    st.dataframe(data.head())

    # Every visualization needs at least one column to choose from.
    if data.columns.empty:
        st.error("The uploaded file does not contain any columns.")
        st.stop()

    numeric_choices = _numeric_columns(data)

    # Navigation options
    options = st.sidebar.selectbox("Select Visualization", [
        "Distribution of a Column",
        "Top-N Categories by Aggregated Value",
        "Heatmap of Numerical Data",
        "Outlier Detection",
        "Box Plot Comparison",
        "Time Series Analysis",
        "Stacked Bar Chart"
    ])

    # Task 1: Distribution of a Column
    if options == "Distribution of a Column":
        st.header("Distribution of a Column")
        column = st.selectbox("Select Column for Distribution", data.columns)
        column_dtype = data[column].dtype
        if _is_categorical(column_dtype):
            # Discrete values: one bar per distinct value.
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.countplot(data=data, x=column, palette='coolwarm', ax=ax)
            ax.set_title(f'Distribution of {column}')
            _show_figure(fig)
        elif _is_numeric(column_dtype):
            # Continuous values: histogram with a density estimate overlay.
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.histplot(data=data, x=column, kde=True, color='blue', ax=ax)
            ax.set_title(f'Distribution of {column}')
            _show_figure(fig)
        else:
            st.error("Selected column is not suitable for distribution visualization.")

    # Task 2: Top-N Categories by Aggregated Value
    elif options == "Top-N Categories by Aggregated Value":
        st.header("Top-N Categories by Aggregated Value")
        category_column = st.selectbox("Select Category Column", data.columns)
        numeric_column = st.selectbox("Select Numeric Column", data.columns)
        n = st.slider("Select Top-N Categories", 1, 20, 5)
        if _is_categorical(data[category_column].dtype) and _is_numeric(data[numeric_column].dtype):
            # Sum the numeric column per category and keep the n largest totals.
            grouped_data = data.groupby(category_column)[numeric_column].sum().reset_index()
            top_n = grouped_data.sort_values(by=numeric_column, ascending=False).head(n)
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(data=top_n, x=numeric_column, y=category_column, palette='viridis', ax=ax)
            ax.set_title(f'Top-{n} {category_column} by {numeric_column}')
            _show_figure(fig)
        else:
            st.error("Ensure you select a categorical column and a numeric column.")

    # Task 3: Heatmap of Numerical Data
    elif options == "Heatmap of Numerical Data":
        st.header("Heatmap of Numerical Data")
        if len(numeric_choices) > 1:
            heatmap_data = data[numeric_choices].corr()
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', ax=ax)
            ax.set_title("Correlation Heatmap")
            _show_figure(fig)
        else:
            st.error("Dataset does not have enough numerical columns for a heatmap.")

    # Task 4: Outlier Detection
    elif options == "Outlier Detection":
        st.header("Outlier Detection")
        if not numeric_choices:
            st.error("Dataset does not have any numerical columns for outlier detection.")
        else:
            numeric_column = st.selectbox("Select Numeric Column for Outlier Detection", numeric_choices)
            threshold = st.slider("Select Z-Score Threshold", 1.0, 5.0, 3.0)
            # Score in a local Series rather than a new 'Z_Score' column, so a
            # same-named column in the upload is never overwritten.
            # nan_policy="omit" keeps one missing cell from turning every
            # score into NaN (which would hide all outliers).
            values = data[numeric_column].astype("float64").to_numpy()
            z_scores = pd.Series(zscore(values, nan_policy="omit"), index=data.index)
            # NaN scores compare False, so rows with missing values are never flagged.
            is_outlier = z_scores.abs().gt(threshold).to_numpy()
            outliers = data[is_outlier]
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.scatterplot(data=data, x=data.index, y=numeric_column, label='Data', color='blue', ax=ax)
            sns.scatterplot(data=outliers, x=outliers.index, y=numeric_column, label='Outliers', color='red', ax=ax)
            ax.set_title(f'Outlier Detection in {numeric_column}')
            ax.legend()
            _show_figure(fig)

    # Task 5: Box Plot Comparison
    elif options == "Box Plot Comparison":
        st.header("Box Plot Comparison")
        category_column = st.selectbox("Select Categorical Column", data.columns)
        numeric_column = st.selectbox("Select Numeric Column", data.columns)
        if _is_categorical(data[category_column].dtype) and _is_numeric(data[numeric_column].dtype):
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.boxplot(data=data, x=category_column, y=numeric_column, palette='Set2', ax=ax)
            ax.set_title(f'Box Plot of {numeric_column} by {category_column}')
            _show_figure(fig)
        else:
            st.error("Ensure you select a categorical column and a numeric column.")

    # Task 6: Time Series Analysis
    elif options == "Time Series Analysis":
        st.header("Time Series Analysis")
        # A column named 'Time' stays the first (default) choice; any other
        # column already typed as datetime is offered as an alternative.
        time_choices = (['Time'] if 'Time' in data.columns else []) + [
            name
            for name, dtype in data.dtypes.items()
            if name != 'Time' and pd.api.types.is_datetime64_any_dtype(dtype)
        ]
        if not time_choices:
            st.error("The dataset does not have a time column.")
        else:
            time_column = st.selectbox("Select Time Column", time_choices)
            # The time axis itself must not be offered as the value to sum.
            value_choices = [name for name in numeric_choices if name != time_column]
            if not value_choices:
                st.error("Dataset does not have any numerical columns to plot over time.")
            else:
                value_column = st.selectbox("Select Value Column", value_choices)
                # Unparseable entries become NaT and are dropped rather than
                # aborting the whole chart.
                timestamps = pd.to_datetime(data[time_column], errors="coerce")
                has_time = timestamps.notna()
                if not has_time.any():
                    st.error(f"Column '{time_column}' does not contain any valid dates.")
                else:
                    # Aggregate the value column into calendar-month totals.
                    months = timestamps[has_time].dt.to_period('M')
                    time_data = data.loc[has_time, value_column].groupby(months).sum().reset_index()
                    time_data[time_column] = time_data[time_column].dt.to_timestamp()
                    fig, ax = plt.subplots(figsize=(12, 8))
                    sns.lineplot(data=time_data, x=time_column, y=value_column, ax=ax, marker='o')
                    ax.set_title(f'Time Series Analysis of {value_column}')
                    _show_figure(fig)

    # Task 7: Stacked Bar Chart
    elif options == "Stacked Bar Chart":
        st.header("Stacked Bar Chart")
        category_column = st.selectbox("Select Categorical Column", data.columns)
        # The grouping column cannot also be one of the stacked series.
        stackable = [name for name in numeric_choices if name != category_column]
        numeric_columns = st.multiselect("Select Numeric Columns", stackable)
        if len(numeric_columns) > 1 and category_column:
            grouped_data = data.groupby(category_column)[numeric_columns].sum()
            fig, ax = plt.subplots(figsize=(12, 8))
            grouped_data.plot(kind='bar', stacked=True, ax=ax, colormap='coolwarm')
            # Column labels may be non-strings (e.g. numeric headers), so
            # stringify them before joining.
            ax.set_title(f'Stacked Bar Chart of {", ".join(map(str, numeric_columns))} by {category_column}')
            _show_figure(fig)
        else:
            st.error("Ensure you select one categorical column and multiple numeric columns.")

else:
    st.warning("Please upload an Excel file to begin.")