|
import streamlit as st
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.decomposition import PCA
|
|
|
|
|
|
|
|
# Page header.
st.title("Unsupervised Data Clustering App")

# Collapsible description of what the app does. The text is written straight
# into the expander container returned by st.expander().
st.expander("About this App").write(
    "This app allows you to upload any type of unlabeled dataset "
    "and automatically clusters the data using K-means clustering. "
    "It visualizes the clusters using PCA and provides time series and cluster distribution plots "
    "to help you identify patterns and groupings within your data."
)
|
|
|
|
|
|
# --- Data source selection ---------------------------------------------------
# Streamlit re-executes this script from top to bottom on every widget
# interaction, and st.button() returns True only during the single rerun that
# the click itself triggers. The example dataset is therefore kept in
# st.session_state so that it is still available on later reruns (for instance
# after the cluster slider further down is moved).
#
# After this section, `df` is defined only when some data is available; an
# uploaded CSV takes precedence over the example dataset.
uploaded_file = st.file_uploader("Upload Custom CSV file", type=["csv"])

if st.button("Test With An Example Dataset"):
    try:
        # Imported lazily: the Hugging Face `datasets` package is needed only
        # for the example dataset, so uploading a CSV works without it.
        from datasets import load_dataset
    except ImportError:
        st.error(
            "The `datasets` package is required to load the example dataset. "
            "Install it with `pip install datasets`."
        )
    else:
        dataset = load_dataset('kheejay88/country_data', split='train')
        st.session_state["example_df"] = pd.DataFrame(dataset)
        st.success("Loaded example dataset from Hugging Face.")

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
elif "example_df" in st.session_state:
    # Hand out a copy so that downstream in-place edits (dropped or added
    # columns) never leak into the frame stored in the session.
    df = st.session_state["example_df"].copy()

    with st.expander("Dataset Columns"):
        st.write("""
        **country** – Name of the country\n
        **child_mort** – Death of children under 5 years of age per 1000 live births\n
        **exports** – Exports of goods and services per capita (as a percentage of GDP)\n
        **health** – Total health spending per capita (as a percentage of GDP)\n
        **imports** – Imports of goods and services per capita (as a percentage of GDP)\n
        **income** – Net income per person\n
        **inflation** – Annual inflation rate (percentage)\n
        **life_expec** – Average life expectancy at birth (in years)\n
        **total_fer** – Total fertility rate (average number of children per woman)\n
        **gdpp** – GDP per capita\n
        """)
|
|
|
|
def prepare_features(raw_df):
    """Return the feature table used for clustering, derived from *raw_df*.

    Only numeric and boolean columns are kept. Columns that contain no values
    at all are discarded first; afterwards every row that still has a missing
    value is removed, because the clustering and PCA steps cannot handle NaN.

    Args:
        raw_df: The loaded dataset. It is not modified.

    Returns:
        A new DataFrame holding the usable feature columns. Surviving rows
        keep their original index labels.
    """
    numeric = raw_df.select_dtypes(include=["number", "bool"])
    return numeric.dropna(axis=1, how="all").dropna(axis=0, how="any")


def render_cluster_analysis(raw_df):
    """Cluster *raw_df* with K-means and render the result in the Streamlit page.

    Shows a preview of the feature table, a slider for the number of clusters,
    a PCA scatter plot coloured by cluster, a per-cluster line plot of one
    selectable column against the row index, and a cluster size count plot.

    Args:
        raw_df: The loaded dataset. It is not modified; all derived columns
            are added to an internal copy.

    Returns:
        None. If the data cannot be clustered an error is shown in the page
        and the function returns early.
    """
    features = prepare_features(raw_df)
    n_rows, n_features = features.shape

    # Three rows keeps the slider range valid (min 2 < max) and guarantees
    # that the default of 3 clusters never exceeds the number of samples.
    if n_features == 0 or n_rows < 3:
        st.error(
            "The dataset needs at least one numeric column and three rows "
            "without missing values to be clustered."
        )
        return

    dropped_rows = len(raw_df) - n_rows
    if dropped_rows:
        st.warning(f"Ignored {dropped_rows} row(s) containing missing values.")

    st.write("### Raw Data:")
    st.write(features.head())

    scaled_data = StandardScaler().fit_transform(features)

    # K-means cannot form more clusters than there are samples.
    num_clusters = st.slider(
        "Select number of clusters",
        min_value=2,
        max_value=min(10, n_rows),
        value=3,
    )
    clusters = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(scaled_data)

    # PCA cannot return more components than there are features; with a
    # single feature the second plot axis is filled with zeros.
    n_components = min(2, n_features)
    pca_data = PCA(n_components=n_components).fit_transform(scaled_data)

    result = features.copy()
    result['Cluster'] = clusters
    result['PCA1'] = pca_data[:, 0]
    result['PCA2'] = pca_data[:, 1] if n_components == 2 else 0.0

    st.write("### Cluster Visualization:")
    fig, ax = plt.subplots()
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=result, palette='viridis', ax=ax)
    st.pyplot(fig)
    # Close every figure after rendering: the script reruns on each
    # interaction and unclosed figures would pile up in memory.
    plt.close(fig)

    # Offer only the dataset's own columns, not the derived
    # Cluster/PCA1/PCA2 helper columns.
    feature_cols = features.columns.tolist()
    selected_col = st.selectbox("Select column for time series visualization", feature_cols)
    st.write("### Time Series Plot:")
    fig, ax = plt.subplots()
    # groupby iterates the clusters in sorted order, so the legend is stable.
    for cluster, cluster_data in result.groupby('Cluster'):
        ax.plot(cluster_data.index, cluster_data[selected_col], label=f'Cluster {cluster}')
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)

    st.write("### Cluster Distribution:")
    fig, ax = plt.subplots()
    sns.countplot(x='Cluster', data=result, palette='viridis', ax=ax)
    st.pyplot(fig)
    plt.close(fig)


# `df` exists only when the data-loading section above produced a dataset.
if 'df' in locals():
    render_cluster_analysis(df)
|
|
|