Spaces:

Peter512
/

UML_assignment

Sleeping

App Files Files Community

UML_assignment / app.py

Peter512

Upload app.py

89048a0 verified 6 months ago

raw

history blame contribute delete

14.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	from sklearn.cluster import KMeans
	from sklearn.metrics import silhouette_score
	import warnings
	warnings.filterwarnings('ignore')

	# Page configuration
	# Sets the browser-tab title and icon, uses the wide layout, and opens the
	# sidebar by default (the sidebar hosts the filter controls built in main()).
	# NOTE(review): some Streamlit releases require set_page_config to run before
	# any other st.* call - confirm against the deployed version before moving it.
	st.set_page_config(
	page_title="Spotify Playlist Optimizer",
	page_icon="🎵",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	# Custom CSS for better styling
	# Injected as raw HTML (unsafe_allow_html=True). The .cluster-header class
	# defined here is the one referenced by the <div class="cluster-header">
	# snippet that main() emits for the selected category banner.
	# NOTE(review): the `.main` / `.stMetric` selectors target Streamlit's own
	# generated markup, which presumably can change between versions - verify
	# after upgrading Streamlit.
	st.markdown("""
	<style>
	.main > div {
	padding-top: 2rem;
	}
	.stMetric > div > div > div > div {
	font-size: 1rem;
	}
	.cluster-header {
	background: linear-gradient(90deg, #1DB954, #1ed760);
	color: white;
	padding: 10px;
	border-radius: 5px;
	text-align: center;
	margin-bottom: 20px;
	}
	</style>
	""", unsafe_allow_html=True)

	@st.cache_data
	def load_and_process_data():
	    """Download the Spotify songs CSV, clean it, and attach PCA + K-Means results.

	    Pipeline, in order:
	      1. read the CSV from the hard-coded GitHub URL;
	      2. keep the first row per (track_name, track_artist) pair;
	      3. keep rows with 30000 < duration_ms < 600000, 50 < tempo < 200 and
	         track_popularity > 0 (all bounds exclusive);
	      4. drop rows with a missing value in any audio feature;
	      5. standardise the audio features and project them with a full PCA;
	      6. run K-Means (6 clusters, random_state=42, n_init=10) on the first
	         five principal components.

	    Returns:
	        tuple: ``(df_final, pca, scaler, audio_features, cluster_names)`` where
	        ``df_final`` is the cleaned frame plus the columns ``Cluster``,
	        ``PC1``, ``PC2``, ``PC3`` and ``Cluster_Name``; ``pca`` and ``scaler``
	        are the fitted estimators; ``audio_features`` is the list of feature
	        column names; ``cluster_names`` maps cluster id -> display name.
	    """
	    # Columns fed into scaling / PCA / clustering.
	    audio_features = [
	        'danceability', 'energy', 'speechiness', 'acousticness',
	        'instrumentalness', 'liveness', 'valence', 'tempo',
	        'duration_ms', 'loudness', 'key', 'mode'
	    ]

	    # Display names keyed by the integer label K-Means assigns.
	    # NOTE(review): the names are fixed per label id; they only stay
	    # meaningful while the data and random_state yield the same labelling.
	    cluster_names = {
	        0: "Energetic Mainstream",
	        1: "Acoustic Chill",
	        2: "High-Energy Party",
	        3: "Moody & Introspective",
	        4: "Workout & Motivation",
	        5: "Focus & Background"
	    }

	    raw_tracks = pd.read_csv(
	        'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
	    )

	    # One row per (track, artist); the first occurrence wins.
	    unique_tracks = raw_tracks.drop_duplicates(
	        subset=['track_name', 'track_artist'], keep='first'
	    )

	    # Outlier filter, built from named masks so each bound is readable.
	    duration = unique_tracks['duration_ms']
	    tempo = unique_tracks['tempo']
	    duration_ok = (duration > 30000) & (duration < 600000)
	    tempo_ok = (tempo > 50) & (tempo < 200)
	    popularity_ok = unique_tracks['track_popularity'] > 0
	    tracks = unique_tracks[duration_ok & tempo_ok & popularity_ok]

	    # Rows missing any audio feature cannot be scaled or clustered.
	    tracks = tracks.dropna(subset=audio_features)

	    # Standardise, then project onto all principal components.
	    scaler = StandardScaler()
	    pca = PCA()
	    components = pca.fit_transform(scaler.fit_transform(tracks[audio_features]))

	    # Cluster on the leading five components only.
	    n_components = 5
	    kmeans = KMeans(n_clusters=6, random_state=42, n_init=10)
	    labels = kmeans.fit_predict(components[:, :n_components])

	    # assign() returns a new frame, so the cleaned frame is not mutated.
	    df_final = tracks.assign(
	        Cluster=labels,
	        PC1=components[:, 0],
	        PC2=components[:, 1],
	        PC3=components[:, 2],
	    )
	    df_final['Cluster_Name'] = df_final['Cluster'].map(cluster_names)

	    return df_final, pca, scaler, audio_features, cluster_names

	def create_cluster_profile(df, cluster_id, audio_features, min_diff_pct=10):
	    """Summarise one cluster relative to every row of ``df``.

	    Args:
	        df: DataFrame containing a ``Cluster`` column, the columns named in
	            ``audio_features``, and ``track_popularity``, ``playlist_genre``,
	            ``track_name`` and ``track_artist``.
	        cluster_id: Value of ``Cluster`` selecting the rows to profile.
	        audio_features: Column names to compare against the overall mean.
	        min_diff_pct: A feature is reported only when its absolute percentage
	            difference from the overall mean exceeds this value (default 10).

	    Returns:
	        dict with keys:
	          * ``size`` - number of rows in the cluster;
	          * ``avg_popularity`` - mean ``track_popularity`` (NaN if empty);
	          * ``top_genres`` - value counts of the three most common genres;
	          * ``differences`` - list of ``{'feature', 'value', 'diff_pct'}``
	            dicts sorted by descending ``abs(diff_pct)``;
	          * ``sample_tracks`` - up to five most popular tracks (name, artist,
	            popularity).
	    """
	    cluster_data = df[df['Cluster'] == cluster_id]
	    overall_stats = df[audio_features].mean()
	    cluster_stats = cluster_data[audio_features].mean()

	    differences = []
	    for feature in audio_features:
	        baseline = overall_stats[feature]
	        # A zero (or missing) baseline has no meaningful percentage scale;
	        # dividing by it would yield inf/NaN entries, so skip the feature.
	        if pd.isna(baseline) or baseline == 0:
	            continue

	        # Divide by the magnitude of the baseline: with a negative mean
	        # (e.g. loudness) a signed denominator would flip the sign and report
	        # an above-average cluster as below average.
	        diff_pct = (cluster_stats[feature] - baseline) / abs(baseline) * 100

	        # NaN (empty cluster) compares False here and is dropped as well.
	        if abs(diff_pct) > min_diff_pct:
	            differences.append({
	                'feature': feature.replace('_', ' ').title(),
	                'value': cluster_stats[feature],
	                'diff_pct': diff_pct,
	            })

	    # Most distinctive features first.
	    differences.sort(key=lambda entry: abs(entry['diff_pct']), reverse=True)

	    return {
	        'size': len(cluster_data),
	        'avg_popularity': cluster_data['track_popularity'].mean(),
	        'top_genres': cluster_data['playlist_genre'].value_counts().head(3),
	        'differences': differences,
	        'sample_tracks': cluster_data.nlargest(5, 'track_popularity')[
	            ['track_name', 'track_artist', 'track_popularity']
	        ],
	    }

	def main():
	    """Render the Spotify Playlist Optimizer dashboard.

	    Loads the clustered track data, builds the sidebar controls (category,
	    focus feature, minimum popularity, genres), filters the tracks, and draws
	    the scatter plot, metrics, radar chart, distribution plots and the
	    text-based insight sections for the selected category.

	    Returns early with a warning when the filters leave no tracks at all, or
	    none in the selected category. Without that guard the market-share
	    division raises ZeroDivisionError on an empty selection, and an empty
	    category renders NaN metrics.
	    """
	    # Load data (the fitted PCA / scaler objects are returned but not used here)
	    df, _pca, _scaler, audio_features, cluster_names = load_and_process_data()

	    # Header
	    st.title("🎵 Spotify Playlist Optimizer")
	    st.markdown("### Data-Driven Solutions for Music Engagement")

	    # Business problem statement
	    with st.expander("📊 Business Problem & Solution", expanded=True):
	        col1, col2 = st.columns(2)

	        with col1:
	            st.markdown("""
	The Challenge:
	- Streaming platforms face high skip rates that impact user engagement
	- Traditional genre-based grouping fails in real contexts
	- Poor playlist flow leads to user disengagement
	- Lost revenue from subscription churn
	""")

	        with col2:
	            st.markdown("""
	Our Solution:
	- Audio feature-based clustering identifies 6 playlist types
	- Data-driven curation reduces skip rates
	- Context-aware recommendations improve engagement
	- Actionable insights for streaming platforms
	""")

	    # Sidebar controls
	    st.sidebar.header("🎛️ Explore Clusters")

	    # Control 1: Cluster Selection
	    selected_cluster = st.sidebar.selectbox(
	        "Select Playlist Category:",
	        options=list(cluster_names.keys()),
	        format_func=lambda x: f"{cluster_names[x]} (Cluster {x})",
	        index=2  # Default to High-Energy Party
	    )

	    # Control 2: Audio Feature Focus
	    focus_feature = st.sidebar.selectbox(
	        "Focus Audio Feature:",
	        options=['energy', 'danceability', 'valence', 'acousticness', 'tempo'],
	        index=0
	    )

	    # Control 3: Popularity Filter
	    min_popularity = st.sidebar.slider(
	        "Minimum Track Popularity:",
	        min_value=0,
	        max_value=100,
	        value=20,
	        step=10
	    )

	    # Control 4: Genre Filter
	    available_genres = df['playlist_genre'].unique()
	    selected_genres = st.sidebar.multiselect(
	        "Filter by Genres:",
	        options=available_genres,
	        default=available_genres
	    )

	    # Filter data based on controls
	    filtered_df = df[
	        (df['track_popularity'] >= min_popularity) &
	        (df['playlist_genre'].isin(selected_genres))
	    ]

	    # Guard: every panel below divides by len(filtered_df) or averages over
	    # it, so an empty selection must stop here instead of crashing.
	    if filtered_df.empty:
	        st.warning(
	            "No tracks match the current filters. "
	            "Lower the minimum popularity or select more genres."
	        )
	        return

	    # The profile and the selected-cluster slice feed several panels;
	    # compute them once.
	    cluster_profile = create_cluster_profile(filtered_df, selected_cluster, audio_features)
	    selected_cluster_data = filtered_df[filtered_df['Cluster'] == selected_cluster]

	    # Guard: an empty category would otherwise show NaN popularity and a
	    # meaningless skip-risk rating.
	    if cluster_profile['size'] == 0:
	        st.warning(
	            f"No tracks in {cluster_names[selected_cluster]} match the current filters."
	        )
	        return

	    market_share = cluster_profile['size'] / len(filtered_df)

	    # Main content area
	    col1, col2 = st.columns([2, 1])

	    with col1:
	        # Visualization 1: Cluster scatter plot
	        st.subheader("🎯 Playlist Categories in Audio Space")

	        fig = px.scatter(
	            filtered_df,
	            x='PC1',
	            y='PC2',
	            color='Cluster_Name',
	            size=focus_feature,
	            hover_data=['track_name', 'track_artist', 'track_popularity'],
	            title=f"Playlist Categories (sized by {focus_feature.title()})",
	            width=700,
	            height=500
	        )

	        # Highlight selected cluster
	        fig.add_scatter(
	            x=selected_cluster_data['PC1'],
	            y=selected_cluster_data['PC2'],
	            mode='markers',
	            marker=dict(color='red', size=12, symbol='diamond', line=dict(color='white', width=2)),
	            name=f'Selected: {cluster_names[selected_cluster]}',
	            showlegend=True
	        )

	        fig.update_layout(
	            xaxis_title="PC1: Energy Spectrum (High-Energy ← → Acoustic)",
	            yaxis_title="PC2: Mood Dimension (Positive ← → Introspective)"
	        )

	        st.plotly_chart(fig, use_container_width=True)

	    with col2:
	        # Key metrics for selected cluster
	        st.markdown(f"""
	<div class="cluster-header">
	<h3>{cluster_names[selected_cluster]}</h3>
	</div>
	""", unsafe_allow_html=True)

	        st.metric("Tracks in Category", f"{cluster_profile['size']:,}")
	        st.metric("Avg Popularity", f"{cluster_profile['avg_popularity']:.1f}/100")
	        st.metric("Market Share", f"{market_share*100:.1f}%")

	    # Visualization 2: Audio feature radar chart
	    st.subheader("📊 Audio DNA Profile")

	    col1, col2 = st.columns(2)

	    with col1:
	        # Radar chart for selected cluster
	        radar_features = ['danceability', 'energy', 'valence', 'acousticness', 'speechiness', 'liveness']
	        radar_labels = [f.title() for f in radar_features]

	        cluster_means = selected_cluster_data[radar_features].mean()
	        overall_means = filtered_df[radar_features].mean()

	        fig = go.Figure()

	        fig.add_trace(go.Scatterpolar(
	            r=cluster_means.values,
	            theta=radar_labels,
	            fill='toself',
	            name=cluster_names[selected_cluster],
	            line_color='#1DB954'
	        ))

	        fig.add_trace(go.Scatterpolar(
	            r=overall_means.values,
	            theta=radar_labels,
	            fill='toself',
	            name='Overall Average',
	            line_color='gray',
	            opacity=0.5
	        ))

	        fig.update_layout(
	            polar=dict(
	                radialaxis=dict(
	                    visible=True,
	                    range=[0, 1]
	                )),
	            showlegend=True,
	            title="Cluster vs Overall Average"
	        )

	        st.plotly_chart(fig, use_container_width=True)

	    with col2:
	        # Distinctive characteristics
	        st.write("Key Characteristics:")
	        for diff in cluster_profile['differences'][:5]:
	            direction = "📈" if diff['diff_pct'] > 0 else "📉"
	            st.write(f"{direction} {diff['feature']}: {diff['value']:.3f} ({diff['diff_pct']:+.1f}%)")

	        st.write("Top Genres:")
	        for genre, count in cluster_profile['top_genres'].items():
	            percentage = (count / cluster_profile['size']) * 100
	            st.write(f"• {genre}: {percentage:.1f}%")

	    # Visualization 3: Feature distribution comparison
	    st.subheader("🎵 Feature Deep Dive")

	    fig = make_subplots(
	        rows=1, cols=2,
	        subplot_titles=(f'{focus_feature.title()} Distribution', 'All Clusters Comparison')
	    )

	    # Distribution plot
	    cluster_focus = selected_cluster_data[focus_feature]
	    other_focus = filtered_df[filtered_df['Cluster'] != selected_cluster][focus_feature]

	    fig.add_trace(
	        go.Histogram(x=cluster_focus, name=cluster_names[selected_cluster], opacity=0.7, nbinsx=30),
	        row=1, col=1
	    )
	    fig.add_trace(
	        go.Histogram(x=other_focus, name='Other Clusters', opacity=0.5, nbinsx=30),
	        row=1, col=1
	    )

	    # Box plot comparison (the selected cluster is drawn in red)
	    for cluster_id, cluster_label in cluster_names.items():
	        cluster_data = filtered_df[filtered_df['Cluster'] == cluster_id]
	        fig.add_trace(
	            go.Box(y=cluster_data[focus_feature], name=cluster_label,
	                   boxmean=True, marker_color='red' if cluster_id == selected_cluster else None),
	            row=1, col=2
	        )

	    fig.update_layout(height=400, showlegend=True)
	    st.plotly_chart(fig, use_container_width=True)

	    # Dynamic Insights
	    st.subheader("💡 Dynamic Business Insights")

	    col1, col2 = st.columns(2)

	    with col1:
	        st.markdown("Category Strategy:")

	        # Strategy tier is driven purely by the category's share of the
	        # filtered tracks.
	        if market_share > 0.20:
	            strategy = "MARKET LEADER"
	            recommendation = "Focus on differentiation and premium sub-segments"
	        elif market_share > 0.12:
	            strategy = "GROWTH OPPORTUNITY"
	            recommendation = "Expand content library and increase user awareness"
	        else:
	            strategy = "NICHE EXCELLENCE"
	            recommendation = "Perfect the experience for dedicated users"

	        st.success(f"{strategy}")
	        st.write(recommendation)

	        # Skip risk assessment (derived from average popularity only)
	        avg_popularity = cluster_profile['avg_popularity']
	        if avg_popularity > 60:
	            skip_risk = "LOW"
	            risk_color = "green"
	        elif avg_popularity > 40:
	            skip_risk = "MEDIUM"
	            risk_color = "orange"
	        else:
	            skip_risk = "HIGH"
	            risk_color = "red"

	        st.markdown(f"Skip Risk: :{risk_color}[{skip_risk}]")

	    with col2:
	        st.markdown("Sample Popular Tracks:")
	        for i, (_, track) in enumerate(cluster_profile['sample_tracks'].head(3).iterrows(), 1):
	            st.write(f"{i}. {track['track_name']} - {track['track_artist']} (Pop: {track['track_popularity']})")

	        # Context recommendations
	        st.markdown("Best Use Cases:")
	        use_cases = {
	            0: ["Background listening", "Casual playlists"],
	            1: ["Coffee shops", "Study sessions", "Relaxation"],
	            2: ["Parties", "Clubs", "High-intensity workouts"],
	            3: ["Evening listening", "Emotional moments"],
	            4: ["Gym workouts", "Running", "Motivation"],
	            5: ["Work", "Focus sessions", "Ambient background"]
	        }

	        for use_case in use_cases.get(selected_cluster, ["General listening"]):
	            st.write(f"• {use_case}")

	    # Summary recommendations
	    st.subheader("🎯 Actionable Recommendations")

	    recommendations = [
	        "Algorithm Enhancement: Use cluster boundaries for better song transitions",
	        "Playlist Curation: Create context-specific playlists based on cluster profiles",
	        "User Interface: Implement audio feature sliders for personalized discovery",
	        "Skip Prediction: Monitor cross-cluster jumps to predict skip likelihood",
	        "Revenue Optimization: Target B2B licensing for specific cluster use cases"
	    ]

	    for rec in recommendations:
	        st.write(f"• {rec}")

	    # Footer
	    st.markdown("---")
	    st.markdown("""
	Key Insight: This analysis reveals that audio features, not genres, determine playlist compatibility.
	By clustering songs based on their acoustic DNA, we can reduce skip rates and improve user engagement
	through data-driven curation.
	""")

	if __name__ == "__main__":
	    main()