# Page header captured along with this file (not part of the program):
#   LeonceNsh's picture / Update app.py / 0d3acf8 verified
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from datetime import datetime
# ---- DATA LOADING ----
def _read_table(path, **read_kwargs):
    """Read a delimited text file and lower-case all of its column names."""
    return pd.read_csv(path, **read_kwargs).rename(columns=str.lower)


guest_by_channel = _read_table("data/table_guests_by_channel.csv")
topic_summaries = _read_table("data/table_topic_summaries.csv")
videos_by_topic = _read_table("data/videos_by_topic.csv")
youtube_metadata = _read_table("data/youtube_metadata.tsv", sep="\t")
guest_timeline = _read_table("data/guest_timeline.tsv", sep="\t")

# Column names in these two tables get their spaces replaced by underscores.
for _frame in (guest_by_channel, topic_summaries):
    _frame.columns = _frame.columns.str.replace(' ', '_')

# Parse the publish date of every video-level table into datetimes.
for _frame in (youtube_metadata, guest_timeline, videos_by_topic):
    _frame['video_publish_date'] = pd.to_datetime(_frame['video_publish_date'])

# Derived metric, then order guests from most to least viewed; several
# dashboard functions rely on this ordering when they take the first rows.
guest_by_channel['avg_views_per_channel'] = guest_by_channel['views_sum'].div(
    guest_by_channel['no_of_channels']
)
guest_by_channel = guest_by_channel.sort_values('views_sum', ascending=False)

# Channel columns offered for filtering, kept in table column order.
_KNOWN_CHANNELS = (
    'adin_live', 'flagrant', 'full_send_podcast', 'impaulsive',
    'lex_fridman', 'pbd_podcast', 'powerfuljre', 'shawn_ryan_show', 'theo_von',
)
channel_opts = [col for col in guest_by_channel.columns if col in _KNOWN_CHANNELS]

# Topic columns are the ones whose name starts with '#'.
topic_opts = [col for col in topic_summaries.columns if col.startswith('#')]

# Alphabetical list of guest categories for the category dropdown.
categories = guest_by_channel['category'].unique().tolist()
categories.sort()
# ---- DASHBOARD COMPONENTS ----
def executive_summary():
    """Build the executive overview figure.

    Reads the module-level ``guest_by_channel`` table and lays out a 2x2
    grid: two number indicators (total views and average views per guest,
    both expressed in millions) above two bar charts (guest count per
    category and total views per category). Both bar charts list the
    categories in order of descending total views.

    Returns:
        plotly.graph_objects.Figure: the assembled dashboard figure.
    """
    total_guests = len(guest_by_channel)
    total_views = guest_by_channel['views_sum'].sum()
    # An empty guest table would otherwise divide by zero.
    avg_views_per_guest = total_views / total_guests if total_guests else 0

    category_views = (
        guest_by_channel.groupby('category')['views_sum']
        .sum()
        .sort_values(ascending=False)
    )
    # Re-order the counts so both bar charts share the same category order.
    guest_counts = guest_by_channel['category'].value_counts().loc[category_views.index]

    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "indicator"}, {"type": "indicator"}],
               [{"type": "xy"}, {"type": "bar"}]],
        subplot_titles=("Total Guest Views (M)", "Avg Views per Guest (M)",
                        "Guest Count by Category", "Views by Category"),
    )

    # Top row: KPI indicators, scaled to millions.
    fig.add_trace(
        go.Indicator(
            mode="number",
            value=total_views / 1_000_000,
            number={"suffix": "M", "valueformat": ".1f"},
            title={"text": "Total Views"},
        ),
        row=1, col=1,
    )
    fig.add_trace(
        go.Indicator(
            mode="number",
            value=avg_views_per_guest / 1_000_000,
            number={"suffix": "M", "valueformat": ".1f"},
            title={"text": "Avg Views per Guest"},
        ),
        row=1, col=2,
    )

    # Bottom row: per-category guest counts and per-category total views.
    fig.add_trace(
        go.Bar(
            x=guest_counts.index,
            y=guest_counts.values,
            marker_color='teal',
            text=guest_counts.values,
            textposition='auto',
            name='Guest Count by Category',
        ),
        row=2, col=1,
    )
    fig.add_trace(
        go.Bar(
            x=category_views.index,
            y=category_views.values,
            marker_color='indianred',
            text=category_views.values,
            textposition='auto',
            name='Views by Category',
        ),
        row=2, col=2,
    )

    fig.update_layout(
        height=600,
        title_text="Executive Dashboard - Key Performance Metrics",
        showlegend=False,
    )
    return fig
# 2. Guest Performance Analysis
def guest_performance_analysis(top_n=20, category_filter=None, gender_filter=None):
    """Bar chart of the most-viewed guests, optionally filtered.

    Args:
        top_n: Maximum number of guests to show. Coerced to ``int`` because
            UI sliders may deliver the value as a float, which
            ``DataFrame.head`` does not accept.
        category_filter: Category to keep; ``None``, empty, or
            ``"All Categories"`` disables the filter.
        gender_filter: ``"Female"`` or ``"Male"`` to filter on the
            ``is_a_woman`` column; any other value keeps everyone.

    Returns:
        plotly.graph_objects.Figure: the bar chart, or an empty figure with
        an explanatory title when no guest matches the filters.
    """
    top_n = int(top_n)

    # Boolean indexing already produces new frames, so no defensive copy is needed.
    filtered_data = guest_by_channel
    if category_filter and category_filter != "All Categories":
        filtered_data = filtered_data[filtered_data['category'] == category_filter]
    if gender_filter == "Female":
        filtered_data = filtered_data[filtered_data['is_a_woman'] == True]  # noqa: E712
    elif gender_filter == "Male":
        filtered_data = filtered_data[filtered_data['is_a_woman'] == False]  # noqa: E712

    # guest_by_channel is sorted by views_sum (descending) at module load,
    # so the first rows are the top guests.
    top_guests = filtered_data.head(top_n)
    if top_guests.empty:
        return go.Figure().update_layout(title="No guests match the selected filters")

    fig = px.bar(
        top_guests,
        x='guest',
        y='views_sum',
        color='category',
        hover_data=['no_of_channels', 'avg_views_per_channel'],
        labels={
            'guest': 'Guest Name',
            'views_sum': 'Total Views',
            'category': 'Guest Category',
            'no_of_channels': 'Number of Channels',
            'avg_views_per_channel': 'Avg Views per Channel',
        },
        title=f'Top {top_n} Guests by Total Views',
        height=600,
    )
    fig.update_layout(
        xaxis_title="Guest",
        yaxis_title="Total Views",
        # Colouring by category splits the bars into several traces; this
        # keeps the x axis ordered by total views rather than by trace.
        xaxis={'categoryorder': 'total descending'},
        yaxis=dict(tickformat=".2s"),
    )
    # Rotate x-axis labels for better readability.
    fig.update_xaxes(tickangle=45)
    return fig
# 3. Channel Comparison
def channel_comparison(selected_channels, metric="guest_count"):
    """Compare channels on one aggregate metric as a bar chart.

    For every selected channel, the guests whose channel column equals 1 in
    the module-level ``guest_by_channel`` table are aggregated.

    Args:
        selected_channels: Channel column names to compare. When empty or
            ``None``, the first three entries of ``channel_opts`` are used.
        metric: One of ``"guest_count"``, ``"total_views"``, ``"avg_views"``
            or ``"category_diversity"``.

    Returns:
        plotly.graph_objects.Figure: one bar per selected channel.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    if not selected_channels:
        selected_channels = channel_opts[:3]  # Default to first 3 channels

    # metric name -> (aggregate over a channel's guests, chart title, y-axis label)
    metric_specs = {
        "guest_count": (
            len,
            "Number of Guests per Channel",
            "Guest Count",
        ),
        "total_views": (
            lambda guests: guests['views_sum'].sum(),
            "Total Views per Channel",
            "Total Views",
        ),
        "avg_views": (
            lambda guests: guests['views_sum'].mean(),
            "Average Views per Guest per Channel",
            "Average Views",
        ),
        "category_diversity": (
            # dropna=False matches counting the entries of .unique().
            lambda guests: guests['category'].nunique(dropna=False),
            "Category Diversity per Channel",
            "Number of Unique Categories",
        ),
    }
    if metric not in metric_specs:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown metric {metric!r}; expected one of {sorted(metric_specs)}"
        )
    aggregate, title, y_label = metric_specs[metric]

    channel_data = {
        channel: aggregate(guest_by_channel[guest_by_channel[channel] == 1])
        for channel in selected_channels
    }

    fig = px.bar(
        x=list(channel_data.keys()),
        y=list(channel_data.values()),
        labels={'x': 'Channel', 'y': y_label},
        title=title,
        height=500,
    )
    # View-based metrics get SI-prefixed tick labels.
    if metric in ("total_views", "avg_views"):
        fig.update_layout(yaxis=dict(tickformat=".2s"))
    return fig
# 4. Topic Trend Analysis
def topic_trend_analysis(selected_topics, time_period="all"):
    """Line chart of the monthly mean of each selected topic column.

    Works on the module-level ``videos_by_topic`` table. Every selected
    topic that exists as a column is averaged per calendar month of
    ``video_publish_date`` and drawn as its own line.

    Args:
        selected_topics: Topic column names. When empty or ``None``, the
            first three entries of ``topic_opts`` are used. Names that are
            not columns of ``videos_by_topic`` are ignored.
        time_period: ``"last_year"`` or ``"last_6_months"`` to restrict to
            recent videos; any other value keeps all videos.

    Returns:
        plotly.graph_objects.Figure: one line per topic, or an empty figure
        with an explanatory title when there is nothing to plot.
    """
    if not selected_topics:
        selected_topics = topic_opts[:3]  # Default to first 3 topics

    filtered_data = videos_by_topic
    lookback = {
        "last_year": pd.DateOffset(years=1),
        "last_6_months": pd.DateOffset(months=6),
    }.get(time_period)
    if lookback is not None:
        # Use the column's own timezone (None when naive) so the comparison
        # does not mix naive and aware timestamps.
        now = pd.Timestamp.now(tz=filtered_data['video_publish_date'].dt.tz)
        filtered_data = filtered_data[filtered_data['video_publish_date'] >= now - lookback]

    # Keep known topics only, de-duplicated, in the order they were selected.
    available = [
        topic for topic in dict.fromkeys(selected_topics)
        if topic in filtered_data.columns
    ]
    if not available:
        return go.Figure().update_layout(title="No data available for selected topics")
    if filtered_data.empty:
        return go.Figure().update_layout(title="No videos published in the selected time period")

    # Group by month without adding a column to the (possibly sliced) frame.
    months = filtered_data['video_publish_date'].dt.to_period('M')
    monthly = filtered_data[available].groupby(months).mean()
    monthly.index = monthly.index.to_timestamp()

    # Long format with one shared value column. Plotting the loop variable's
    # column (as before) showed only the last selected topic.
    trend_df = (
        monthly.rename_axis('month')
        .reset_index()
        .melt(id_vars='month', var_name='topic', value_name='frequency')
    )

    fig = px.line(
        trend_df,
        x='month',
        y='frequency',
        color='topic',
        labels={
            'month': 'Month',
            'frequency': 'Topic Frequency',
            'topic': 'Topic',
        },
        title='Topic Trends Over Time',
        height=500,
    )
    return fig
# 5. Guest Category ROI Analysis
def guest_category_roi(metric="views_per_appearance"):
    """Bar chart of views earned per guest or per appearance, by category.

    Aggregates the module-level ``guest_by_channel`` table per category
    (summed views, guest count, summed ``no_of_channels``) and plots the
    chosen ratio, highest first. Bars are coloured and labelled with the
    category's guest count.

    Args:
        metric: ``"views_per_guest"`` selects views divided by guest count;
            any other value selects views divided by total appearances.

    Returns:
        plotly.graph_objects.Figure: the per-category bar chart.
    """
    per_category = (
        guest_by_channel.groupby('category')
        .agg(
            total_views=('views_sum', 'sum'),
            guest_count=('guest', 'count'),
            total_appearances=('no_of_channels', 'sum'),
        )
        .reset_index()
    )
    category_metrics = per_category.assign(
        views_per_guest=per_category['total_views'] / per_category['guest_count'],
        views_per_appearance=per_category['total_views'] / per_category['total_appearances'],
    )

    # Anything other than "views_per_guest" falls back to per-appearance.
    per_guest = metric == "views_per_guest"
    y_value = 'views_per_guest' if per_guest else 'views_per_appearance'
    y_label = 'Views per Guest' if per_guest else 'Views per Appearance'
    title = 'Views per Guest by Category' if per_guest else 'Views per Appearance by Category'

    ranked = category_metrics.sort_values(y_value, ascending=False)
    axis_labels = {
        'category': 'Guest Category',
        y_value: y_label,
        'guest_count': 'Number of Guests',
    }
    fig = px.bar(
        ranked,
        x='category',
        y=y_value,
        color='guest_count',
        text='guest_count',
        labels=axis_labels,
        title=title,
        height=500,
    )
    fig.update_layout(yaxis=dict(tickformat=".2s"))
    return fig
# 6. Content Strategy Recommendations
def content_strategy_recommendations(selected_topics=None):
    """Two-panel figure comparing topics on reach and engagement.

    For each selected topic that is a column of the module-level
    ``videos_by_topic`` table, the videos with a positive value in that
    column are averaged for views, likes and comments. The left panel shows
    average views per topic; the right panel plots average views against
    engagement rate ((avg likes + avg comments) / avg views), with marker
    size derived from average comments.

    Args:
        selected_topics: Topic column names. When empty or ``None``, the
            first five entries of ``topic_opts`` are used.

    Returns:
        plotly.graph_objects.Figure: the two-panel figure, or an empty
        figure with an explanatory title when no topic yields any video.
    """
    if not selected_topics:
        selected_topics = topic_opts[:5]  # Default to first 5 topics

    # topic -> averaged metrics over the videos covering that topic
    topic_engagement = {}
    for topic in selected_topics:
        if topic not in videos_by_topic.columns:
            continue
        covering = videos_by_topic[videos_by_topic[topic] > 0]
        if covering.empty:
            continue
        mean_views = covering['video_view_count'].mean()
        mean_likes = covering['video_like_count'].mean()
        mean_comments = covering['video_comment_count'].mean()
        # Engagement rate is reported as 0 when average views are not positive.
        if mean_views > 0:
            rate = (mean_likes + mean_comments) / mean_views
        else:
            rate = 0
        topic_engagement[topic] = {
            'avg_views': mean_views,
            'avg_likes': mean_likes,
            'avg_comments': mean_comments,
            'engagement_rate': rate,
        }

    engagement_df = (
        pd.DataFrame.from_dict(topic_engagement, orient='index')
        .reset_index()
        .rename(columns={'index': 'topic'})
    )
    if engagement_df.empty:
        return go.Figure().update_layout(title="No data available for selected topics")

    views_bars = go.Bar(
        x=engagement_df['topic'],
        y=engagement_df['avg_views'],
        name='Avg Views',
    )
    engagement_points = go.Scatter(
        x=engagement_df['avg_views'],
        y=engagement_df['engagement_rate'],
        mode='markers+text',
        text=engagement_df['topic'],
        textposition="top center",
        marker=dict(
            size=engagement_df['avg_comments'] / 100,  # Size based on comment count
            sizemin=10,
            sizemode='area',
        ),
        name='Engagement Rate',
    )

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "bar"}, {"type": "scatter"}]],
        subplot_titles=("Average Views by Topic", "Engagement Analysis"),
    )
    fig.add_trace(views_bars, row=1, col=1)
    fig.add_trace(engagement_points, row=1, col=2)

    fig.update_layout(
        height=500,
        title_text="Content Strategy Analysis by Topic",
        showlegend=False,
    )
    fig.update_yaxes(title_text="Average Views", row=1, col=1)
    fig.update_yaxes(title_text="Engagement Rate (Likes+Comments)/Views", row=1, col=2)
    fig.update_xaxes(title_text="Topic", row=1, col=1)
    fig.update_xaxes(title_text="Average Views", row=1, col=2)
    return fig
# 7. Guest Timeline Analysis
def guest_timeline_analysis(selected_guest, view_type="views"):
    """Timeline of one guest's video views, per video or cumulative.

    Args:
        selected_guest: Guest name to look up in the module-level
            ``guest_timeline`` table. When empty or ``None``, the first row
            of ``guest_by_channel`` is used; that table is sorted by
            ``views_sum`` descending at module load, so this is the
            most-viewed guest.
        view_type: ``"views"`` draws per-video view counts, one line per
            channel, plus a dashed average line. Any other value draws the
            running total of views over time.

    Returns:
        plotly.graph_objects.Figure: the timeline, or an empty figure with
        an explanatory title when no data is available.
    """
    if not selected_guest:
        if guest_by_channel.empty:
            # No guests at all: nothing to default to.
            return go.Figure().update_layout(title="No guest data available")
        selected_guest = guest_by_channel.iloc[0]['guest']

    guest_data = guest_timeline[guest_timeline['guest'] == selected_guest]
    if guest_data.empty:
        return go.Figure().update_layout(title=f"No timeline data available for {selected_guest}")

    # One chronological sort serves both views. A stable sort keeps videos
    # published on the same date in a deterministic order for the running total.
    guest_data = guest_data.sort_values('video_publish_date', kind='stable')

    if view_type == "views":
        fig = px.line(
            guest_data,
            x='video_publish_date',
            y='video_view_count',
            color='channel_title',
            markers=True,
            labels={
                'video_publish_date': 'Date',
                'video_view_count': 'Views',
                'channel_title': 'Channel',
            },
            title=f'View Count Timeline for {selected_guest}',
            height=500,
        )
        # Dashed reference line at the guest's mean views per video.
        avg_views = guest_data['video_view_count'].mean()
        fig.add_hline(y=avg_views, line_dash="dash", line_color="gray",
                      annotation_text=f"Avg: {avg_views:.0f} views")
        return fig

    # Cumulative view: running total across all channels.
    cumulative = guest_data.assign(
        cumulative_views=guest_data['video_view_count'].cumsum()
    )
    return px.line(
        cumulative,
        x='video_publish_date',
        y='cumulative_views',
        markers=True,
        labels={
            'video_publish_date': 'Date',
            'cumulative_views': 'Cumulative Views',
        },
        title=f'Cumulative Views for {selected_guest}',
        height=500,
    )
# 8. Channel Growth Analysis
def channel_growth_analysis(selected_channels):
    """Monthly average views and video counts for the selected channels.

    Channel option names are matched against the lower-cased
    ``channel_title`` column of the module-level ``youtube_metadata`` table
    after replacing underscores with spaces.

    Args:
        selected_channels: Channel option names. When empty or ``None``,
            the first three entries of ``channel_opts`` are used.

    Returns:
        plotly.graph_objects.Figure: a two-panel figure (average views per
        video as lines, monthly video count as bars), or an empty figure
        with an explanatory title when no video matches.
    """
    if not selected_channels:
        selected_channels = channel_opts[:3]  # Default to first 3 channels

    wanted_titles = {name.replace('_', ' ') for name in selected_channels}
    title_matches = youtube_metadata['channel_title'].str.lower().isin(wanted_titles)
    channel_data = youtube_metadata[title_matches]
    if channel_data.empty:
        return go.Figure().update_layout(title="No data available for selected channels")

    # assign() builds a new frame, so the month column is never written onto
    # a slice of youtube_metadata (avoids SettingWithCopyWarning).
    monthly_stats = (
        channel_data
        .assign(month=channel_data['video_publish_date'].dt.to_period('M'))
        .groupby(['channel_title', 'month'])
        .agg(
            avg_views=('video_view_count', 'mean'),
            video_count=('video_id', 'count'),
        )
        .reset_index()
    )
    monthly_stats['month'] = monthly_stats['month'].dt.to_timestamp()

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "scatter"}, {"type": "bar"}]],
        subplot_titles=("Average Views per Video Over Time", "Monthly Video Production"),
    )

    # Give each channel one colour shared by its line and its bars, and a
    # single legend entry that toggles both traces.
    palette = px.colors.qualitative.Plotly
    for position, channel in enumerate(monthly_stats['channel_title'].unique()):
        channel_monthly = monthly_stats[monthly_stats['channel_title'] == channel]
        colour = palette[position % len(palette)]
        fig.add_trace(
            go.Scatter(
                x=channel_monthly['month'],
                y=channel_monthly['avg_views'],
                mode='lines+markers',
                name=channel,
                legendgroup=channel,
                line=dict(color=colour),
                marker=dict(color=colour),
            ),
            row=1, col=1,
        )
        fig.add_trace(
            go.Bar(
                x=channel_monthly['month'],
                y=channel_monthly['video_count'],
                name=channel,
                legendgroup=channel,
                showlegend=False,
                marker_color=colour,
            ),
            row=1, col=2,
        )

    fig.update_layout(
        height=500,
        title_text="Channel Growth Analysis",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.update_yaxes(title_text="Average Views per Video", row=1, col=1)
    fig.update_yaxes(title_text="Number of Videos", row=1, col=2)
    fig.update_xaxes(title_text="Month", row=1, col=1)
    fig.update_xaxes(title_text="Month", row=1, col=2)
    return fig
# ---- GRADIO INTERFACE ----
# Layout: one tab per analysis. Each tab has a narrow controls column, a wide
# plot column, and a button wired to the matching dashboard function.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📊 YouTube Content Strategy & Analytics Dashboard")
    gr.Markdown("### Business Intelligence for Content Strategy and Guest Selection")

    # Tab 1: static overview, no inputs.
    with gr.Tab("Executive Summary"):
        gr.Markdown("### Key Performance Indicators and Business Overview")
        exec_summary_plot = gr.Plot()
        exec_summary_btn = gr.Button("Generate Executive Summary")
        exec_summary_btn.click(fn=executive_summary, outputs=exec_summary_plot)

    # Tab 2: top guests with category / gender filters.
    with gr.Tab("Guest Performance"):
        gr.Markdown("### Guest Performance Analysis")
        with gr.Row():
            with gr.Column(scale=1):
                top_n = gr.Slider(
                    minimum=5, maximum=50, value=20, step=5,
                    label="Number of Guests",
                )
                category_filter = gr.Dropdown(
                    choices=["All Categories"] + categories,
                    value="All Categories",
                    label="Filter by Category",
                )
                gender_filter = gr.Dropdown(
                    choices=["All", "Male", "Female"],
                    value="All",
                    label="Filter by Gender",
                )
                guest_perf_btn = gr.Button("Analyze Guest Performance")
            with gr.Column(scale=3):
                guest_perf_plot = gr.Plot()
        guest_perf_btn.click(
            fn=guest_performance_analysis,
            inputs=[top_n, category_filter, gender_filter],
            outputs=guest_perf_plot,
        )

    # Tab 3: channel-vs-channel comparison on a chosen metric.
    with gr.Tab("Channel Analysis"):
        gr.Markdown("### Channel Comparison and Performance")
        with gr.Row():
            with gr.Column(scale=1):
                channel_select = gr.CheckboxGroup(
                    choices=channel_opts,
                    value=channel_opts[:3],
                    label="Select Channels",
                )
                metric_select = gr.Radio(
                    choices=["guest_count", "total_views", "avg_views", "category_diversity"],
                    value="total_views",
                    label="Comparison Metric",
                )
                channel_btn = gr.Button("Compare Channels")
            with gr.Column(scale=3):
                channel_plot = gr.Plot()
        channel_btn.click(
            fn=channel_comparison,
            inputs=[channel_select, metric_select],
            outputs=channel_plot,
        )

    # Tab 4: topic frequency over time.
    with gr.Tab("Topic Trends"):
        gr.Markdown("### Topic Trend Analysis")
        with gr.Row():
            with gr.Column(scale=1):
                topic_select = gr.CheckboxGroup(
                    choices=topic_opts,
                    value=topic_opts[:3],
                    label="Select Topics",
                )
                time_period = gr.Radio(
                    choices=["all", "last_year", "last_6_months"],
                    value="all",
                    label="Time Period",
                )
                topic_btn = gr.Button("Analyze Topic Trends")
            with gr.Column(scale=3):
                topic_plot = gr.Plot()
        topic_btn.click(
            fn=topic_trend_analysis,
            inputs=[topic_select, time_period],
            outputs=topic_plot,
        )

    # Tab 5: views per guest / per appearance by category.
    with gr.Tab("ROI Analysis"):
        gr.Markdown("### Return on Investment by Guest Category")
        with gr.Row():
            with gr.Column(scale=1):
                roi_metric = gr.Radio(
                    choices=["views_per_appearance", "views_per_guest"],
                    value="views_per_appearance",
                    label="ROI Metric",
                )
                roi_btn = gr.Button("Calculate ROI")
            with gr.Column(scale=3):
                roi_plot = gr.Plot()
        roi_btn.click(
            fn=guest_category_roi,
            inputs=[roi_metric],
            outputs=roi_plot,
        )

    # Tab 6: per-guest timeline; guest names are listed in reverse alphabetical order.
    with gr.Tab("Guest Timeline"):
        gr.Markdown("### Guest Performance Timeline")
        with gr.Row():
            with gr.Column(scale=1):
                guest_names = sorted(guest_by_channel['guest'].unique().tolist(), reverse=True)
                guest_select = gr.Dropdown(choices=guest_names, label="Select Guest")
                timeline_type = gr.Radio(
                    choices=["views", "cumulative"],
                    value="views",
                    label="Timeline View",
                )
                timeline_btn = gr.Button("Analyze Timeline")
            with gr.Column(scale=3):
                timeline_plot = gr.Plot()
        timeline_btn.click(
            fn=guest_timeline_analysis,
            inputs=[guest_select, timeline_type],
            outputs=timeline_plot,
        )

    # Tab 7: monthly growth per channel.
    with gr.Tab("Channel Growth"):
        gr.Markdown("### Channel Growth Analysis")
        with gr.Row():
            with gr.Column(scale=1):
                growth_channels = gr.CheckboxGroup(
                    choices=channel_opts,
                    value=channel_opts[:3],
                    label="Select Channels",
                )
                growth_btn = gr.Button("Analyze Growth")
            with gr.Column(scale=3):
                growth_plot = gr.Plot()
        growth_btn.click(
            fn=channel_growth_analysis,
            inputs=[growth_channels],
            outputs=growth_plot,
        )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()