UML_assignment / app.py
Peter512's picture
Upload app.py
89048a0 verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')
# Page-level settings handed to Streamlit: browser-tab title and icon, wide
# layout, and the sidebar opened on first load.
_PAGE_SETTINGS = {
    "page_title": "Spotify Playlist Optimizer",
    "page_icon": "🎵",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected as raw HTML. Besides padding and metric font tweaks it
# defines the `.cluster-header` class (green gradient banner).
_CUSTOM_CSS = """
<style>
.main > div {
padding-top: 2rem;
}
.stMetric > div > div > div > div {
font-size: 1rem;
}
.cluster-header {
background: linear-gradient(90deg, #1DB954, #1ed760);
color: white;
padding: 10px;
border-radius: 5px;
text-align: center;
margin-bottom: 20px;
}
</style>
"""
# unsafe_allow_html is required so the <style> tag is emitted instead of escaped.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_data
def load_and_process_data():
    """Download the Spotify songs CSV, clean it, and label every track with a cluster.

    Pipeline, in order: de-duplicate on (track_name, track_artist), keep rows
    inside the duration/tempo/popularity bounds, drop rows missing any audio
    feature, standardize the features, run PCA, then fit a 6-cluster KMeans
    on the first five principal components.

    Returns:
        A tuple ``(df_final, pca, scaler, audio_features, cluster_names)``:
        the cleaned frame with added ``Cluster``, ``PC1``-``PC3`` and
        ``Cluster_Name`` columns, the fitted PCA and StandardScaler objects,
        the list of feature column names, and the cluster-id -> display-name
        mapping.
    """
    source_csv = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
    raw = pd.read_csv(source_csv)

    # Columns that feed scaling, PCA and clustering.
    audio_features = [
        'danceability', 'energy', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'loudness', 'key', 'mode',
    ]

    # One row per (track, artist) pair; the first occurrence is kept.
    unique_tracks = raw.drop_duplicates(subset=['track_name', 'track_artist'], keep='first')

    # Strict bounds: duration in (30000, 600000) ms, tempo in (50, 200),
    # and a popularity above zero.
    duration = unique_tracks['duration_ms']
    tempo = unique_tracks['tempo']
    popularity = unique_tracks['track_popularity']
    keep_mask = (
        (duration > 30000)
        & (duration < 600000)
        & (tempo > 50)
        & (tempo < 200)
        & (popularity > 0)
    )
    usable = unique_tracks.loc[keep_mask].dropna(subset=audio_features)

    # Standardize, then project onto all principal components.
    scaler = StandardScaler()
    pca = PCA()
    components = pca.fit_transform(scaler.fit_transform(usable[audio_features]))

    # Cluster on the leading components only.
    n_components = 5
    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
    labels = kmeans.fit_predict(components[:, :n_components])

    df_final = usable.copy()
    df_final['Cluster'] = labels
    for axis, column in enumerate(('PC1', 'PC2', 'PC3')):
        df_final[column] = components[:, axis]

    # NOTE(review): these display names are tied to raw KMeans label ids;
    # presumably chosen by inspecting one fitted run -- confirm they still
    # match if the data or the library version changes.
    cluster_names = dict(enumerate([
        "Energetic Mainstream",
        "Acoustic Chill",
        "High-Energy Party",
        "Moody & Introspective",
        "Workout & Motivation",
        "Focus & Background",
    ]))
    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

    return df_final, pca, scaler, audio_features, cluster_names
def create_cluster_profile(df, cluster_id, audio_features):
    """Summarize one cluster and how its audio features deviate from all of ``df``.

    Args:
        df: Track table containing a ``Cluster`` column, every column named
            in ``audio_features``, and the ``track_popularity``,
            ``playlist_genre``, ``track_name`` and ``track_artist`` columns.
        cluster_id: ``Cluster`` value selecting the rows to profile.
        audio_features: Names of the numeric columns to compare.

    Returns:
        A dict with:
          * ``size`` -- number of rows in the cluster.
          * ``avg_popularity`` -- mean ``track_popularity`` (NaN if empty).
          * ``top_genres`` -- the three most frequent ``playlist_genre``
            values with their counts.
          * ``differences`` -- list of ``{'feature', 'value', 'diff_pct'}``
            dicts for features whose cluster mean differs from the overall
            mean by more than 10 %, largest absolute difference first.
          * ``sample_tracks`` -- up to five rows with the highest
            ``track_popularity`` (name, artist, popularity columns).
    """
    cluster_data = df[df['Cluster'] == cluster_id]
    overall_stats = df[audio_features].mean()
    cluster_stats = cluster_data[audio_features].mean()

    differences = []
    for feature in audio_features:
        baseline = overall_stats[feature]
        if baseline == 0:
            # A percentage change relative to zero is undefined; emitting
            # inf here would dominate the sorted list.
            continue
        # Divide by the magnitude of the baseline so the sign of diff_pct
        # always follows (cluster mean - overall mean). With a signed
        # denominator, a negative overall mean (a dB-scaled loudness column
        # would presumably be one) inverts the reported direction.
        diff_pct = (cluster_stats[feature] - baseline) / abs(baseline) * 100
        # NaN (empty cluster) fails this comparison and is skipped.
        if abs(diff_pct) > 10:  # Only significant differences
            differences.append({
                'feature': feature.replace('_', ' ').title(),
                'value': cluster_stats[feature],
                'diff_pct': diff_pct,
            })
    differences.sort(key=lambda entry: abs(entry['diff_pct']), reverse=True)

    return {
        'size': len(cluster_data),
        'avg_popularity': cluster_data['track_popularity'].mean(),
        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
        'differences': differences,
        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[
            ['track_name', 'track_artist', 'track_popularity']
        ],
    }
# Static copy shown on the page. Kept at module level so the multi-line
# strings stay flush-left and byte-for-byte stable.
_CHALLENGE_MD = """
**The Challenge:**
- Streaming platforms face high skip rates that impact user engagement
- Traditional genre-based grouping fails in real contexts
- Poor playlist flow leads to user disengagement
- Lost revenue from subscription churn
"""

_SOLUTION_MD = """
**Our Solution:**
- Audio feature-based clustering identifies 6 playlist types
- Data-driven curation reduces skip rates
- Context-aware recommendations improve engagement
- Actionable insights for streaming platforms
"""

# Filled with str.format(name=...); contains no other braces.
_CLUSTER_HEADER_HTML = """
<div class="cluster-header">
<h3>{name}</h3>
</div>
"""

_FOOTER_MD = """
**Key Insight**: This analysis reveals that audio features, not genres, determine playlist compatibility.
By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement
through data-driven curation.
"""

# Features offered in the "Focus Audio Feature" selector.
_FOCUS_FEATURES = ['energy', 'danceability', 'valence', 'acousticness', 'tempo']

# Axes of the radar chart (a list, because it is used to index a DataFrame).
_RADAR_FEATURES = ['danceability', 'energy', 'valence', 'acousticness', 'speechiness', 'liveness']

# Suggested listening contexts keyed by cluster id.
_USE_CASES = {
    0: ["Background listening", "Casual playlists"],
    1: ["Coffee shops", "Study sessions", "Relaxation"],
    2: ["Parties", "Clubs", "High-intensity workouts"],
    3: ["Evening listening", "Emotional moments"],
    4: ["Gym workouts", "Running", "Motivation"],
    5: ["Work", "Focus sessions", "Ambient background"],
}

_RECOMMENDATIONS = (
    "**Algorithm Enhancement**: Use cluster boundaries for better song transitions",
    "**Playlist Curation**: Create context-specific playlists based on cluster profiles",
    "**User Interface**: Implement audio feature sliders for personalized discovery",
    "**Skip Prediction**: Monitor cross-cluster jumps to predict skip likelihood",
    "**Revenue Optimization**: Target B2B licensing for specific cluster use cases",
)


def _render_intro():
    """Draw the page title and the business problem/solution expander."""
    st.title("🎵 Spotify Playlist Optimizer")
    st.markdown("### Data-Driven Solutions for Music Engagement")
    with st.expander("📊 Business Problem & Solution", expanded=True):
        challenge_col, solution_col = st.columns(2)
        with challenge_col:
            st.markdown(_CHALLENGE_MD)
        with solution_col:
            st.markdown(_SOLUTION_MD)


def _render_sidebar(df, cluster_names):
    """Draw the four sidebar controls and return their current values.

    Returns:
        ``(selected_cluster, focus_feature, min_popularity, selected_genres)``.
    """
    st.sidebar.header("🎛️ Explore Clusters")
    selected_cluster = st.sidebar.selectbox(
        "Select Playlist Category:",
        options=list(cluster_names.keys()),
        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
        index=2,  # Third entry of the key list is preselected
    )
    focus_feature = st.sidebar.selectbox(
        "Focus Audio Feature:",
        options=_FOCUS_FEATURES,
        index=0,
    )
    min_popularity = st.sidebar.slider(
        "Minimum Track Popularity:",
        min_value=0,
        max_value=100,
        value=20,
        step=10,
    )
    available_genres = df['playlist_genre'].unique()
    selected_genres = st.sidebar.multiselect(
        "Filter by Genres:",
        options=available_genres,
        default=available_genres,
    )
    return selected_cluster, focus_feature, min_popularity, selected_genres


def _render_cluster_scatter(filtered_df, selected_cluster, focus_feature, cluster_names):
    """Plot PC1 vs PC2 for all filtered tracks and overlay the selected cluster."""
    st.subheader("🎯 Playlist Categories in Audio Space")
    fig = px.scatter(
        filtered_df,
        x='PC1',
        y='PC2',
        color='Cluster_Name',
        size=focus_feature,
        hover_data=['track_name', 'track_artist', 'track_popularity'],
        title=f"Playlist Categories (sized by {focus_feature.title()})",
        width=700,
        height=500,
    )
    # Second trace on top of the base scatter marks the chosen cluster.
    highlighted = filtered_df[filtered_df['Cluster'] == selected_cluster]
    fig.add_scatter(
        x=highlighted['PC1'],
        y=highlighted['PC2'],
        mode='markers',
        marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)),
        name=f'Selected: {cluster_names[selected_cluster]}',
        showlegend=True,
    )
    fig.update_layout(
        xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
        yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)",
    )
    st.plotly_chart(fig, use_container_width=True)


def _render_cluster_metrics(cluster_name, cluster_profile, market_share):
    """Show the cluster banner plus size, average popularity and share metrics."""
    st.markdown(_CLUSTER_HEADER_HTML.format(name=cluster_name), unsafe_allow_html=True)
    st.metric("Tracks in Category", f"{cluster_profile['size']:,}")
    # The mean of an empty cluster is NaN; show a placeholder instead of "nan/100".
    if cluster_profile['size']:
        popularity_text = f"{cluster_profile['avg_popularity']:.1f}/100"
    else:
        popularity_text = "n/a"
    st.metric("Avg Popularity", popularity_text)
    st.metric("Market Share", f"{market_share * 100:.1f}%")


def _render_audio_profile(filtered_df, selected_cluster, cluster_name, cluster_profile):
    """Draw the radar chart (cluster vs overall means) and the key-traits list."""
    st.subheader("📊 Audio DNA Profile")
    radar_col, traits_col = st.columns(2)
    with radar_col:
        members = filtered_df[filtered_df['Cluster'] == selected_cluster]
        cluster_means = members[_RADAR_FEATURES].mean()
        overall_means = filtered_df[_RADAR_FEATURES].mean()
        axis_labels = [feature.title() for feature in _RADAR_FEATURES]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=cluster_means.values,
            theta=axis_labels,
            fill='toself',
            name=cluster_name,
            line_color='#1DB954',
        ))
        fig.add_trace(go.Scatterpolar(
            r=overall_means.values,
            theta=axis_labels,
            fill='toself',
            name='Overall Average',
            line_color='gray',
            opacity=0.5,
        ))
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            showlegend=True,
            title="Cluster vs Overall Average",
        )
        st.plotly_chart(fig, use_container_width=True)
    with traits_col:
        st.write("**Key Characteristics:**")
        for diff in cluster_profile['differences'][:5]:
            direction = "📈" if diff['diff_pct'] > 0 else "📉"
            st.write(f"{direction} **{diff['feature']}**: {diff['value']:.3f} ({diff['diff_pct']:+.1f}%)")
        st.write("**Top Genres:**")
        for genre, count in cluster_profile['top_genres'].items():
            percentage = (count / cluster_profile['size']) * 100
            st.write(f"• {genre}: {percentage:.1f}%")


def _render_feature_deep_dive(filtered_df, selected_cluster, focus_feature, cluster_names):
    """Draw the focus-feature histograms and the per-cluster box plots."""
    st.subheader("🎵 Feature Deep Dive")
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison'),
    )
    # Left panel: selected cluster vs every other cluster.
    cluster_focus = filtered_df[filtered_df['Cluster'] == selected_cluster][focus_feature]
    other_focus = filtered_df[filtered_df['Cluster'] != selected_cluster][focus_feature]
    fig.add_trace(
        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster], opacity=0.7, nbinsx=30),
        row=1, col=1,
    )
    fig.add_trace(
        go.Histogram(x=other_focus, name='Other Clusters', opacity=0.5, nbinsx=30),
        row=1, col=1,
    )
    # Right panel: one box per cluster, the selected one drawn in red.
    for cluster_id, name in cluster_names.items():
        members = filtered_df[filtered_df['Cluster'] == cluster_id]
        fig.add_trace(
            go.Box(
                y=members[focus_feature],
                name=name,
                boxmean=True,
                marker_color='red' if cluster_id == selected_cluster else None,
            ),
            row=1, col=2,
        )
    fig.update_layout(height=400, showlegend=True)
    st.plotly_chart(fig, use_container_width=True)


def _market_strategy(market_share):
    """Map a share of filtered tracks (0-1) to a (label, recommendation) pair."""
    if market_share > 0.20:
        return "MARKET LEADER", "Focus on differentiation and premium sub-segments"
    if market_share > 0.12:
        return "GROWTH OPPORTUNITY", "Expand content library and increase user awareness"
    return "NICHE EXCELLENCE", "Perfect the experience for dedicated users"


def _skip_risk(avg_popularity):
    """Map average popularity to a (risk label, markdown color) pair."""
    if avg_popularity > 60:
        return "LOW", "green"
    if avg_popularity > 40:
        return "MEDIUM", "orange"
    return "HIGH", "red"


def _render_insights(cluster_profile, market_share, selected_cluster):
    """Draw the strategy/skip-risk column and the sample-tracks/use-cases column."""
    st.subheader("💡 Dynamic Business Insights")
    strategy_col, tracks_col = st.columns(2)
    with strategy_col:
        st.markdown("**Category Strategy:**")
        strategy, recommendation = _market_strategy(market_share)
        st.success(f"**{strategy}**")
        st.write(recommendation)
        # Average popularity is the only input to the skip-risk label.
        skip_risk, risk_color = _skip_risk(cluster_profile['avg_popularity'])
        st.markdown(f"**Skip Risk**: :{risk_color}[{skip_risk}]")
    with tracks_col:
        st.markdown("**Sample Popular Tracks:**")
        top_tracks = cluster_profile['sample_tracks'].head(3)
        for i, (_, track) in enumerate(top_tracks.iterrows(), 1):
            st.write(f"{i}. **{track['track_name']}** - {track['track_artist']} (Pop: {track['track_popularity']})")
        st.markdown("**Best Use Cases:**")
        for use_case in _USE_CASES.get(selected_cluster, ["General listening"]):
            st.write(f"• {use_case}")


def _render_recommendations():
    """Draw the static recommendations list and the footer."""
    st.subheader("🎯 Actionable Recommendations")
    for rec in _RECOMMENDATIONS:
        st.write(f"• {rec}")
    st.markdown("---")
    st.markdown(_FOOTER_MD)


def main():
    """Render the whole dashboard for the current sidebar selection.

    Loads the clustered dataset, reads the sidebar controls, filters the
    tracks by minimum popularity and genre, and then draws each page section
    in order. If the filters match no tracks, a warning is shown and
    rendering stops, because the market-share computation divides by the
    number of filtered tracks. If the selected cluster has no tracks left
    after filtering, the top row is drawn and the cluster-specific sections
    are replaced by a notice.
    """
    # The fitted PCA and scaler are returned by the loader but not used here.
    df, _pca, _scaler, audio_features, cluster_names = load_and_process_data()

    _render_intro()
    selected_cluster, focus_feature, min_popularity, selected_genres = _render_sidebar(df, cluster_names)

    filtered_df = df[
        (df['track_popularity'] >= min_popularity)
        & (df['playlist_genre'].isin(selected_genres))
    ]
    if filtered_df.empty:
        st.warning("No tracks match the current filters. Lower the minimum popularity or select more genres.")
        return

    cluster_name = cluster_names[selected_cluster]
    cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
    # Safe: filtered_df is known to be non-empty at this point.
    market_share = cluster_profile['size'] / len(filtered_df)

    scatter_col, metrics_col = st.columns([2, 1])
    with scatter_col:
        _render_cluster_scatter(filtered_df, selected_cluster, focus_feature, cluster_names)
    with metrics_col:
        _render_cluster_metrics(cluster_name, cluster_profile, market_share)

    if cluster_profile['size'] == 0:
        # Everything below is computed from the cluster's own rows and would
        # only show NaN values.
        st.info("No tracks from this category match the current filters.")
        return

    _render_audio_profile(filtered_df, selected_cluster, cluster_name, cluster_profile)
    _render_feature_deep_dive(filtered_df, selected_cluster, focus_feature, cluster_names)
    _render_insights(cluster_profile, market_share, selected_cluster)
    _render_recommendations()
# Launch the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()