Spaces:

aikanava
/

wine_quality_pca_explorer

Sleeping

App Files Files Community

aikanava committed on Apr 28

Commit

2baa578

1 Parent(s): 3d623ec

upload files

Browse files

Files changed (3) hide show

app.py +121 -0
requirements.txt +5 -0
winequality-red.csv +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# app.py
+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+import matplotlib.pyplot as plt
+import seaborn as sns
+# Streamlit page setup
+st.set_page_config(page_title="PCA Explorer - Wine Quality", page_icon="🍷", layout="wide")
+# Title and short description
+st.title("🍷 Principal Component Analysis (PCA) on Wine Quality Dataset")
+st.write("""
+This app demonstrates **Principal Component Analysis (PCA)** for dimensionality reduction and visualization of the **Wine Quality Dataset**.
+""")
+# Load Wine Quality dataset (local file)
+@st.cache_data
+def load_data():
+    data = pd.read_csv("winequality-red.csv")  # Make sure the dataset is named correctly
+    return data
+data = load_data()
+# Sidebar settings
+st.sidebar.header("Settings")
+n_components = st.sidebar.slider("Select number of PCA components", 2, min(data.shape[1], 10), 2)
+# Features selection (all numeric columns except 'quality')
+features = data.drop(columns=['quality'])
+# Standardize the data
+scaler = StandardScaler()
+scaled_features = scaler.fit_transform(features)
+# Perform PCA
+pca = PCA(n_components=n_components)
+principal_components = pca.fit_transform(scaled_features)
+# Create DataFrame for PCA result
+pca_df = pd.DataFrame(
+    data=principal_components,
+    columns=[f"PC{i+1}" for i in range(n_components)]
+)
+# Add the 'quality' column to the PCA DataFrame
+pca_df['Quality'] = data['quality']
+# Tabs
+tab1, tab2, tab3, tab4 = st.tabs(["📄 Raw Dataset", "📉 PCA Scatter Plot", "📈 Explained Variance", "📥 Download Reduced Dataset"])
+with tab1:
+    st.subheader("📄 Raw Dataset")
+    st.dataframe(data)
+with tab2:
+    st.subheader("📉 PCA Scatter Plot")
+    if n_components >= 2:
+        fig, ax = plt.subplots()
+        sns.scatterplot(
+            x="PC1",
+            y="PC2",
+            data=pca_df,
+            hue="Quality",
+            palette="viridis",
+            s=70,
+            edgecolor="black",
+            alpha=0.7
+        )
+        ax.set_xlabel("Principal Component 1")
+        ax.set_ylabel("Principal Component 2")
+        ax.set_title("PCA - First Two Components")
+        st.pyplot(fig)
+        st.write("""
+        The scatter plot above shows how the wine samples are distributed in the space of the first two principal components.
+        Points are colored based on their **wine quality**, which ranges from 3 (poor) to 8 (excellent).
+        - **Clusters**: Notice how wines of similar quality tend to group together in the plot.
+        - **Separation**: High-quality wines (higher quality scores) tend to be more spread out, while lower-quality wines are often more tightly clustered.
+        """)
+    else:
+        st.warning("Please select at least 2 components to plot a scatter plot.")
+with tab3:
+    st.subheader("📈 Explained Variance Ratio")
+    exp_var = pca.explained_variance_ratio_
+    fig2, ax2 = plt.subplots()
+    sns.barplot(x=[f"PC{i+1}" for i in range(n_components)], y=exp_var, color="skyblue", ax=ax2)
+    ax2.set_ylabel('Explained Variance Ratio')
+    ax2.set_xlabel('Principal Components')
+    ax2.set_title('Variance Explained by Each Principal Component')
+    st.pyplot(fig2)
+    st.markdown(f"**Total Variance Explained:** {np.sum(exp_var):.2f}")
+    st.write("""
+    The bar plot shows the **explained variance ratio** of each principal component.
+    - **Higher variance** means that component carries more information.
+    - In this case, the first few components explain the majority of the variance in the dataset, with later components contributing less.
+    - By selecting fewer components, we reduce dimensionality but still retain most of the data's information.
+    """)
+with tab4:
+    st.subheader("📥 Download Reduced Dataset")
+    st.write("You can download the PCA-reduced dataset as a CSV file.")
+    # Create a CSV for the PCA-reduced data
+    pca_reduced = pca_df.to_csv(index=False)
+    st.download_button(
+        label="Download PCA Reduced Data",
+        data=pca_reduced,
+        file_name="pca_reduced_wine_quality.csv",
+        mime="text/csv"
+    )
+# Footer: a Markdown horizontal rule ("---") followed by a small caption line,
+# emitted after all tab blocks.
+st.markdown("---")
+st.caption("Made with ❤️ using Streamlit")

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+pandas
+scikit-learn
+seaborn
+matplotlib

winequality-red.csv ADDED Viewed

The diff for this file is too large to render. See raw diff