Spaces:

LeonceNsh
/

measuring-discourse-infuence

Running

App Files Files Community

measuring-discourse-infuence / app.py

LeonceNsh

Update app.py

0d3acf8 verified 3 months ago

raw

history blame contribute delete

23.1 kB

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import numpy as np
	from datetime import datetime

	# Load data
	def _read_lowercase(path, **read_options):
	    """Read a delimited file with pandas and lower-case its column names."""
	    return pd.read_csv(path, **read_options).rename(columns=str.lower)


	guest_by_channel = _read_lowercase("data/table_guests_by_channel.csv")
	topic_summaries = _read_lowercase("data/table_topic_summaries.csv")
	videos_by_topic = _read_lowercase("data/videos_by_topic.csv")
	youtube_metadata = _read_lowercase("data/youtube_metadata.tsv", sep="\t")
	guest_timeline = _read_lowercase("data/guest_timeline.tsv", sep="\t")

	# Cleaning and preprocessing: spaces in column names become underscores
	# for the two summary tables.
	for _table in (guest_by_channel, topic_summaries):
	    _table.columns = _table.columns.str.replace(' ', '_')

	# Convert the publish-date column of each video-level table to datetime.
	for _table in (youtube_metadata, guest_timeline, videos_by_topic):
	    _table['video_publish_date'] = pd.to_datetime(_table['video_publish_date'])

	# Derived metric, then order guests from most to least total views.
	# Several dashboard functions rely on this ordering (e.g. "top N" via head()).
	guest_by_channel = guest_by_channel.assign(
	    avg_views_per_channel=guest_by_channel['views_sum'] / guest_by_channel['no_of_channels']
	).sort_values('views_sum', ascending=False)

	# Channel columns offered in the UI: only those actually present in the data,
	# kept in the order they appear in the table.
	_KNOWN_CHANNELS = (
	    'adin_live', 'flagrant', 'full_send_podcast', 'impaulsive',
	    'lex_fridman', 'pbd_podcast', 'powerfuljre', 'shawn_ryan_show', 'theo_von',
	)
	channel_opts = [name for name in guest_by_channel.columns if name in _KNOWN_CHANNELS]

	# Topic columns are the ones whose name starts with '#'.
	topic_opts = [name for name in topic_summaries.columns if name.startswith('#')]

	# Distinct guest categories, alphabetically sorted, for the category filter.
	categories = guest_by_channel['category'].unique().tolist()
	categories.sort()

	# ---- DASHBOARD COMPONENTS ----

	def executive_summary():
	    """Build the executive dashboard figure from ``guest_by_channel``.

	    The figure is a 2x2 grid. The top row holds two number indicators
	    (total views and average views per guest, both expressed in millions).
	    The bottom row holds two bar charts: guest count per category and total
	    views per category, both ordered by total views, highest first.

	    Returns:
	        The assembled plotly figure.
	    """
	    total_guests = len(guest_by_channel)
	    total_views = guest_by_channel['views_sum'].sum()
	    # An empty guest table would otherwise produce a 0/0 average.
	    avg_views_per_guest = total_views / total_guests if total_guests else 0

	    # Total views per category, highest first; this order drives both bar charts.
	    category_views = (
	        guest_by_channel.groupby('category')['views_sum']
	        .sum()
	        .sort_values(ascending=False)
	    )
	    # Guest counts re-ordered to line up with the views-per-category ranking.
	    guest_counts = guest_by_channel['category'].value_counts().loc[category_views.index]

	    fig = make_subplots(
	        rows=2, cols=2,
	        specs=[[{"type": "indicator"}, {"type": "indicator"}],
	               [{"type": "xy"}, {"type": "bar"}]],
	        subplot_titles=("Total Guest Views (M)", "Avg Views per Guest (M)",
	                        "Guest Count by Category", "Views by Category")
	    )

	    # Top row: headline numbers, scaled to millions.
	    fig.add_trace(
	        go.Indicator(
	            mode="number",
	            value=total_views / 1_000_000,
	            number={"suffix": "M", "valueformat": ".1f"},
	            title={"text": "Total Views"}
	        ),
	        row=1, col=1
	    )
	    fig.add_trace(
	        go.Indicator(
	            mode="number",
	            value=avg_views_per_guest / 1_000_000,
	            number={"suffix": "M", "valueformat": ".1f"},
	            title={"text": "Avg Views per Guest"}
	        ),
	        row=1, col=2
	    )

	    # Bottom-left: number of guests in each category.
	    fig.add_trace(
	        go.Bar(
	            x=guest_counts.index,
	            y=guest_counts.values,
	            marker_color='teal',
	            text=guest_counts.values,
	            textposition='auto',
	            name='Guest Count by Category'
	        ),
	        row=2, col=1
	    )

	    # Bottom-right: total views in each category.
	    fig.add_trace(
	        go.Bar(
	            x=category_views.index,
	            y=category_views.values,
	            marker_color='indianred',
	            text=category_views.values,
	            textposition='auto',
	            name='Views by Category'
	        ),
	        row=2, col=2
	    )

	    fig.update_layout(
	        height=600,
	        title_text="Executive Dashboard - Key Performance Metrics",
	        showlegend=False
	    )

	    return fig

	# 2. Guest Performance Analysis
	def guest_performance_analysis(top_n=20, category_filter=None, gender_filter=None):
	    """Bar chart of the highest-viewed guests, optionally filtered.

	    Args:
	        top_n: How many guests to show. Coerced to ``int`` because UI sliders
	            may hand over a float, which ``DataFrame.head`` rejects.
	        category_filter: A category name to restrict to; a falsy value or
	            "All Categories" applies no category filter.
	        gender_filter: "Female" or "Male" to filter on the ``is_a_woman``
	            column; any other value applies no gender filter.

	    Returns:
	        A plotly bar figure of total views per guest, coloured by category.
	    """
	    top_n = int(top_n)

	    # Boolean indexing and head() never modify the shared table, so no
	    # defensive copy is needed.
	    filtered_data = guest_by_channel

	    if category_filter and category_filter != "All Categories":
	        filtered_data = filtered_data[filtered_data['category'] == category_filter]

	    if gender_filter == "Female":
	        filtered_data = filtered_data[filtered_data['is_a_woman'] == True]
	    elif gender_filter == "Male":
	        filtered_data = filtered_data[filtered_data['is_a_woman'] == False]

	    # guest_by_channel is sorted by views_sum (descending) at load time,
	    # so the first rows are the top guests.
	    top_guests = filtered_data.head(top_n)

	    fig = px.bar(
	        top_guests,
	        x='guest',
	        y='views_sum',
	        color='category',
	        hover_data=['no_of_channels', 'avg_views_per_channel'],
	        labels={
	            'guest': 'Guest Name',
	            'views_sum': 'Total Views',
	            'category': 'Guest Category',
	            'no_of_channels': 'Number of Channels',
	            'avg_views_per_channel': 'Avg Views per Channel'
	        },
	        title=f'Top {top_n} Guests by Total Views',
	        height=600
	    )

	    fig.update_layout(
	        xaxis_title="Guest",
	        yaxis_title="Total Views",
	        # Colouring by category splits bars into several traces; this keeps
	        # the x axis ordered by views across all of them.
	        xaxis={'categoryorder': 'total descending'},
	        yaxis=dict(tickformat=".2s")
	    )

	    # Rotate x-axis labels for better readability
	    fig.update_xaxes(tickangle=45)

	    return fig

	# 3. Channel Comparison
	def channel_comparison(selected_channels, metric="guest_count"):
	    """Compare channels on a single metric as a bar chart.

	    Args:
	        selected_channels: Channel column names from ``guest_by_channel``.
	            When empty, the first three entries of ``channel_opts`` are used.
	        metric: One of "guest_count", "total_views", "avg_views" or
	            "category_diversity".

	    Returns:
	        A plotly bar figure with one bar per selected channel.

	    Raises:
	        ValueError: If ``metric`` is not one of the supported names.
	    """
	    if not selected_channels:
	        selected_channels = channel_opts[:3]  # Default to first 3 channels

	    # metric name -> (reducer over the rows of one channel's guests, title, y label)
	    metric_specs = {
	        "guest_count": (
	            lambda guests: guests.shape[0],
	            "Number of Guests per Channel",
	            "Guest Count",
	        ),
	        "total_views": (
	            lambda guests: guests['views_sum'].sum(),
	            "Total Views per Channel",
	            "Total Views",
	        ),
	        "avg_views": (
	            lambda guests: guests['views_sum'].mean(),
	            "Average Views per Guest per Channel",
	            "Average Views",
	        ),
	        "category_diversity": (
	            lambda guests: len(guests['category'].unique()),
	            "Category Diversity per Channel",
	            "Number of Unique Categories",
	        ),
	    }

	    # Fail with a clear message instead of an UnboundLocalError further down.
	    if metric not in metric_specs:
	        raise ValueError(
	            f"Unknown metric {metric!r}; expected one of {sorted(metric_specs)}"
	        )
	    reduce_guests, title, y_label = metric_specs[metric]

	    # A guest belongs to a channel when that channel's column equals 1.
	    channel_data = {
	        channel: reduce_guests(guest_by_channel[guest_by_channel[channel] == 1])
	        for channel in selected_channels
	    }

	    fig = px.bar(
	        x=list(channel_data.keys()),
	        y=list(channel_data.values()),
	        labels={'x': 'Channel', 'y': y_label},
	        title=title,
	        height=500
	    )

	    # View counts are large; use SI-style tick labels for them.
	    if metric in ["total_views", "avg_views"]:
	        fig.update_layout(yaxis=dict(tickformat=".2s"))

	    return fig

	# 4. Topic Trend Analysis
	def topic_trend_analysis(selected_topics, time_period="all"):
	    """Line chart of the monthly mean of each selected topic column.

	    Args:
	        selected_topics: Topic column names to plot. When empty, the first
	            three entries of ``topic_opts`` are used. Names that are not
	            columns of ``videos_by_topic`` are skipped.
	        time_period: "last_year" or "last_6_months" to keep only videos
	            published within that window; any other value keeps everything.

	    Returns:
	        A plotly line figure with one line per topic, or an empty figure
	        titled "No data available for selected topics" when none of the
	        requested topics exists in the data.
	    """
	    if not selected_topics:
	        selected_topics = topic_opts[:3]  # Default to first 3 topics

	    publish_dates = videos_by_topic['video_publish_date']

	    lookbacks = {
	        "last_year": pd.DateOffset(years=1),
	        "last_6_months": pd.DateOffset(months=6),
	    }
	    if time_period in lookbacks:
	        # Take "now" in the column's own timezone (None for naive data) so the
	        # comparison never mixes tz-aware and tz-naive timestamps.
	        cutoff = pd.Timestamp.now(tz=publish_dates.dt.tz) - lookbacks[time_period]
	        filtered_data = videos_by_topic[publish_dates >= cutoff]
	    else:
	        filtered_data = videos_by_topic

	    # Month bucket for every remaining video, kept as a separate series so
	    # the shared table (or a slice of it) is never written to.
	    month_key = filtered_data['video_publish_date'].dt.to_period('M').rename('month')

	    topic_trends = []
	    for topic in selected_topics:
	        if topic not in filtered_data.columns:
	            continue
	        # Store every topic's values under the same 'frequency' column so the
	        # concatenated frame is long-form: one row per (month, topic).
	        monthly_data = (
	            filtered_data.groupby(month_key)[topic]
	            .mean()
	            .rename('frequency')
	            .reset_index()
	        )
	        monthly_data['month'] = monthly_data['month'].dt.to_timestamp()
	        monthly_data['topic'] = topic
	        topic_trends.append(monthly_data)

	    if not topic_trends:
	        return go.Figure().update_layout(title="No data available for selected topics")

	    trend_df = pd.concat(topic_trends, ignore_index=True)

	    # y must be the shared value column; using the loop variable here would
	    # plot only the last topic.
	    fig = px.line(
	        trend_df,
	        x='month',
	        y='frequency',
	        color='topic',
	        labels={
	            'month': 'Month',
	            'frequency': 'Topic Frequency',
	            'topic': 'Topic'
	        },
	        title='Topic Trends Over Time',
	        height=500
	    )

	    return fig

	# 5. Guest Category ROI Analysis
	def guest_category_roi(metric="views_per_appearance"):
	    """Bar chart of a views-based return metric for each guest category.

	    Args:
	        metric: "views_per_guest" plots total views divided by the number of
	            guests in the category; any other value plots total views divided
	            by the summed ``no_of_channels`` (views per appearance).

	    Returns:
	        A plotly bar figure, categories ordered by the chosen metric
	        (highest first), coloured and labelled by guest count.
	    """
	    # One row per category with the raw totals.
	    per_category = (
	        guest_by_channel.groupby('category')
	        .agg(
	            total_views=('views_sum', 'sum'),
	            guest_count=('guest', 'count'),
	            total_appearances=('no_of_channels', 'sum'),
	        )
	        .reset_index()
	    )

	    # Ratios derived from those totals.
	    per_category = per_category.assign(
	        views_per_guest=per_category['total_views'] / per_category['guest_count'],
	        views_per_appearance=per_category['total_views'] / per_category['total_appearances'],
	    )

	    # Pick the column to plot together with its display strings.
	    y_value, title, y_label = (
	        ('views_per_guest', 'Views per Guest by Category', 'Views per Guest')
	        if metric == "views_per_guest"
	        else ('views_per_appearance', 'Views per Appearance by Category', 'Views per Appearance')
	    )

	    ranked = per_category.sort_values(y_value, ascending=False)
	    axis_labels = {
	        'category': 'Guest Category',
	        y_value: y_label,
	        'guest_count': 'Number of Guests',
	    }

	    fig = px.bar(
	        ranked,
	        x='category',
	        y=y_value,
	        color='guest_count',
	        text='guest_count',
	        labels=axis_labels,
	        title=title,
	        height=500,
	    )
	    fig.update_layout(yaxis=dict(tickformat=".2s"))

	    return fig

	# 6. Content Strategy Recommendations
	def content_strategy_recommendations(selected_topics=None):
	    """Compare topics by average views and by engagement rate.

	    For every requested topic that is a column of ``videos_by_topic``, the
	    videos with a positive value in that column are averaged for views,
	    likes and comments. Engagement rate is (avg likes + avg comments) / avg
	    views, or 0 when average views is not positive.

	    Args:
	        selected_topics: Topic column names. When empty or None, the first
	            five entries of ``topic_opts`` are used.

	    Returns:
	        A two-panel plotly figure (average-views bars, engagement scatter),
	        or an empty figure titled "No data available for selected topics"
	        when no topic produced any rows.
	    """
	    topics = selected_topics if selected_topics else topic_opts[:5]

	    # topic -> averaged metrics, filled only for topics that have videos.
	    stats_by_topic = {}
	    for name in topics:
	        if name not in videos_by_topic.columns:
	            continue
	        covering = videos_by_topic[videos_by_topic[name] > 0]
	        if covering.empty:
	            continue

	        mean_views = covering['video_view_count'].mean()
	        mean_likes = covering['video_like_count'].mean()
	        mean_comments = covering['video_comment_count'].mean()

	        if mean_views > 0:
	            rate = (mean_likes + mean_comments) / mean_views
	        else:
	            rate = 0

	        stats_by_topic[name] = {
	            'avg_views': mean_views,
	            'avg_likes': mean_likes,
	            'avg_comments': mean_comments,
	            'engagement_rate': rate
	        }

	    # One row per topic, with the topic name as an ordinary column.
	    engagement_df = (
	        pd.DataFrame.from_dict(stats_by_topic, orient='index')
	        .reset_index()
	        .rename(columns={'index': 'topic'})
	    )

	    if engagement_df.empty:
	        return go.Figure().update_layout(title="No data available for selected topics")

	    fig = make_subplots(
	        rows=1, cols=2,
	        specs=[[{"type": "bar"}, {"type": "scatter"}]],
	        subplot_titles=("Average Views by Topic", "Engagement Analysis")
	    )

	    # Left panel: average views per topic.
	    views_bars = go.Bar(
	        x=engagement_df['topic'],
	        y=engagement_df['avg_views'],
	        name='Avg Views'
	    )
	    fig.add_trace(views_bars, row=1, col=1)

	    # Right panel: engagement rate against average views, one labelled
	    # marker per topic, marker area scaled from the average comment count.
	    engagement_points = go.Scatter(
	        x=engagement_df['avg_views'],
	        y=engagement_df['engagement_rate'],
	        mode='markers+text',
	        text=engagement_df['topic'],
	        textposition="top center",
	        marker=dict(
	            size=engagement_df['avg_comments'] / 100,
	            sizemin=10,
	            sizemode='area'
	        ),
	        name='Engagement Rate'
	    )
	    fig.add_trace(engagement_points, row=1, col=2)

	    fig.update_layout(
	        height=500,
	        title_text="Content Strategy Analysis by Topic",
	        showlegend=False
	    )

	    # Axis titles, panel by panel.
	    fig.update_xaxes(title_text="Topic", row=1, col=1)
	    fig.update_yaxes(title_text="Average Views", row=1, col=1)
	    fig.update_xaxes(title_text="Average Views", row=1, col=2)
	    fig.update_yaxes(title_text="Engagement Rate (Likes+Comments)/Views", row=1, col=2)

	    return fig

	# 7. Guest Timeline Analysis
	def guest_timeline_analysis(selected_guest, view_type="views"):
	    """Plot one guest's videos over time.

	    Args:
	        selected_guest: Guest name to look up in ``guest_timeline``. When
	            falsy, the first guest of ``guest_by_channel`` is used (that table
	            is sorted by total views, so this is the most-viewed guest).
	        view_type: "views" draws per-video view counts, one line per channel,
	            with a dashed line at the guest's mean; any other value draws the
	            running total of views across all of the guest's videos.

	    Returns:
	        A plotly line figure, or an empty figure with an explanatory title
	        when the guest has no rows in ``guest_timeline``.
	    """
	    if not selected_guest:
	        # Default to highest viewed guest
	        selected_guest = guest_by_channel.iloc[0]['guest']

	    guest_data = guest_timeline[guest_timeline['guest'] == selected_guest]

	    if guest_data.empty:
	        return go.Figure().update_layout(title=f"No timeline data available for {selected_guest}")

	    # Sort once, with a stable algorithm, so videos sharing a timestamp keep
	    # their file order and the running total below is deterministic.
	    guest_data = guest_data.sort_values('video_publish_date', kind='mergesort')

	    if view_type == "views":
	        fig = px.line(
	            guest_data,
	            x='video_publish_date',
	            y='video_view_count',
	            color='channel_title',
	            markers=True,
	            labels={
	                'video_publish_date': 'Date',
	                'video_view_count': 'Views',
	                'channel_title': 'Channel'
	            },
	            title=f'View Count Timeline for {selected_guest}',
	            height=500
	        )

	        # Reference line at the guest's average views per video.
	        avg_views = guest_data['video_view_count'].mean()
	        fig.add_hline(y=avg_views, line_dash="dash", line_color="gray",
	                      annotation_text=f"Avg: {avg_views:.0f} views")

	    else:  # cumulative
	        # assign() returns a new frame, leaving the filtered data untouched.
	        guest_data = guest_data.assign(
	            cumulative_views=guest_data['video_view_count'].cumsum()
	        )

	        fig = px.line(
	            guest_data,
	            x='video_publish_date',
	            y='cumulative_views',
	            markers=True,
	            labels={
	                'video_publish_date': 'Date',
	                'cumulative_views': 'Cumulative Views'
	            },
	            title=f'Cumulative Views for {selected_guest}',
	            height=500
	        )

	    return fig

	# 8. Channel Growth Analysis
	def channel_growth_analysis(selected_channels):
	    """Plot monthly average views and monthly video counts per channel.

	    Args:
	        selected_channels: Channel option names (underscore-separated). When
	            empty, the first three entries of ``channel_opts`` are used. Each
	            name is matched against the lower-cased ``channel_title`` column
	            of ``youtube_metadata`` after replacing underscores with spaces.

	    Returns:
	        A two-panel plotly figure (average views per video as lines, videos
	        published per month as bars), or an empty figure titled
	        "No data available for selected channels" when nothing matches.
	    """
	    if not selected_channels:
	        selected_channels = channel_opts[:3]  # Default to first 3 channels

	    wanted_titles = {ch.replace('_', ' ') for ch in selected_channels}
	    matches = youtube_metadata['channel_title'].str.lower().isin(wanted_titles)
	    channel_data = youtube_metadata[matches]

	    if channel_data.empty:
	        return go.Figure().update_layout(title="No data available for selected channels")

	    # assign() builds a new frame; writing a column onto the filtered slice
	    # directly would trigger pandas' SettingWithCopyWarning.
	    channel_data = channel_data.assign(
	        month=channel_data['video_publish_date'].dt.to_period('M')
	    )
	    monthly_stats = channel_data.groupby(['channel_title', 'month']).agg(
	        avg_views=('video_view_count', 'mean'),
	        video_count=('video_id', 'count')
	    ).reset_index()

	    monthly_stats['month'] = monthly_stats['month'].dt.to_timestamp()

	    fig = make_subplots(
	        rows=1, cols=2,
	        specs=[[{"type": "scatter"}, {"type": "bar"}]],
	        subplot_titles=("Average Views per Video Over Time", "Monthly Video Production")
	    )

	    # Each channel contributes a line and a bar series. Give both the same
	    # explicit colour and a shared legend group, and show only the line in
	    # the legend, so every channel appears once and toggles as a unit.
	    palette = px.colors.qualitative.Plotly
	    for position, channel in enumerate(monthly_stats['channel_title'].unique()):
	        channel_monthly = monthly_stats[monthly_stats['channel_title'] == channel]
	        colour = palette[position % len(palette)]

	        fig.add_trace(
	            go.Scatter(
	                x=channel_monthly['month'],
	                y=channel_monthly['avg_views'],
	                mode='lines+markers',
	                name=channel,
	                legendgroup=channel,
	                line=dict(color=colour),
	                marker=dict(color=colour)
	            ),
	            row=1, col=1
	        )

	        fig.add_trace(
	            go.Bar(
	                x=channel_monthly['month'],
	                y=channel_monthly['video_count'],
	                name=channel,
	                legendgroup=channel,
	                showlegend=False,
	                marker_color=colour
	            ),
	            row=1, col=2
	        )

	    fig.update_layout(
	        height=500,
	        title_text="Channel Growth Analysis",
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
	    )

	    fig.update_yaxes(title_text="Average Views per Video", row=1, col=1)
	    fig.update_yaxes(title_text="Number of Videos", row=1, col=2)
	    fig.update_xaxes(title_text="Month", row=1, col=1)
	    fig.update_xaxes(title_text="Month", row=1, col=2)

	    return fig

	# ---- GRADIO INTERFACE ----

	# One tab per analysis. Every tab follows the same pattern: controls in a
	# narrow left column, the plot in a wide right column, and a button whose
	# click handler feeds the controls into the matching analysis function.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 📊 YouTube Content Strategy & Analytics Dashboard")
	    gr.Markdown("### Business Intelligence for Content Strategy and Guest Selection")

	    # --- Executive Summary: no inputs, just a button and a plot ---
	    with gr.Tab("Executive Summary"):
	        gr.Markdown("### Key Performance Indicators and Business Overview")
	        exec_summary_plot = gr.Plot()
	        exec_summary_btn = gr.Button("Generate Executive Summary")
	        exec_summary_btn.click(fn=executive_summary, outputs=exec_summary_plot)

	    # --- Guest Performance ---
	    with gr.Tab("Guest Performance"):
	        gr.Markdown("### Guest Performance Analysis")
	        with gr.Row():
	            with gr.Column(scale=1):
	                top_n = gr.Slider(
	                    minimum=5, maximum=50, value=20, step=5,
	                    label="Number of Guests",
	                )
	                category_filter = gr.Dropdown(
	                    choices=["All Categories"] + categories,
	                    value="All Categories",
	                    label="Filter by Category",
	                )
	                gender_filter = gr.Dropdown(
	                    choices=["All", "Male", "Female"],
	                    value="All",
	                    label="Filter by Gender",
	                )
	                guest_perf_btn = gr.Button("Analyze Guest Performance")

	            with gr.Column(scale=3):
	                guest_perf_plot = gr.Plot()

	        guest_perf_btn.click(
	            fn=guest_performance_analysis,
	            inputs=[top_n, category_filter, gender_filter],
	            outputs=guest_perf_plot,
	        )

	    # --- Channel Analysis ---
	    with gr.Tab("Channel Analysis"):
	        gr.Markdown("### Channel Comparison and Performance")
	        with gr.Row():
	            with gr.Column(scale=1):
	                channel_select = gr.CheckboxGroup(
	                    choices=channel_opts,
	                    value=channel_opts[:3],
	                    label="Select Channels",
	                )
	                metric_select = gr.Radio(
	                    choices=["guest_count", "total_views", "avg_views", "category_diversity"],
	                    value="total_views",
	                    label="Comparison Metric",
	                )
	                channel_btn = gr.Button("Compare Channels")

	            with gr.Column(scale=3):
	                channel_plot = gr.Plot()

	        channel_btn.click(
	            fn=channel_comparison,
	            inputs=[channel_select, metric_select],
	            outputs=channel_plot,
	        )

	    # --- Topic Trends ---
	    with gr.Tab("Topic Trends"):
	        gr.Markdown("### Topic Trend Analysis")
	        with gr.Row():
	            with gr.Column(scale=1):
	                topic_select = gr.CheckboxGroup(
	                    choices=topic_opts,
	                    value=topic_opts[:3],
	                    label="Select Topics",
	                )
	                time_period = gr.Radio(
	                    choices=["all", "last_year", "last_6_months"],
	                    value="all",
	                    label="Time Period",
	                )
	                topic_btn = gr.Button("Analyze Topic Trends")

	            with gr.Column(scale=3):
	                topic_plot = gr.Plot()

	        topic_btn.click(
	            fn=topic_trend_analysis,
	            inputs=[topic_select, time_period],
	            outputs=topic_plot,
	        )

	    # --- ROI Analysis ---
	    with gr.Tab("ROI Analysis"):
	        gr.Markdown("### Return on Investment by Guest Category")
	        with gr.Row():
	            with gr.Column(scale=1):
	                roi_metric = gr.Radio(
	                    choices=["views_per_appearance", "views_per_guest"],
	                    value="views_per_appearance",
	                    label="ROI Metric",
	                )
	                roi_btn = gr.Button("Calculate ROI")

	            with gr.Column(scale=3):
	                roi_plot = gr.Plot()

	        roi_btn.click(
	            fn=guest_category_roi,
	            inputs=[roi_metric],
	            outputs=roi_plot,
	        )

	    # --- Guest Timeline ---
	    with gr.Tab("Guest Timeline"):
	        gr.Markdown("### Guest Performance Timeline")
	        with gr.Row():
	            with gr.Column(scale=1):
	                # Guest names are listed in reverse alphabetical order; no
	                # initial value is set, so the handler picks its own default.
	                guest_select = gr.Dropdown(
	                    choices=sorted(guest_by_channel['guest'].unique().tolist(), reverse=True),
	                    label="Select Guest",
	                )
	                timeline_type = gr.Radio(
	                    choices=["views", "cumulative"],
	                    value="views",
	                    label="Timeline View",
	                )
	                timeline_btn = gr.Button("Analyze Timeline")

	            with gr.Column(scale=3):
	                timeline_plot = gr.Plot()

	        timeline_btn.click(
	            fn=guest_timeline_analysis,
	            inputs=[guest_select, timeline_type],
	            outputs=timeline_plot,
	        )

	    # --- Channel Growth ---
	    with gr.Tab("Channel Growth"):
	        gr.Markdown("### Channel Growth Analysis")
	        with gr.Row():
	            with gr.Column(scale=1):
	                growth_channels = gr.CheckboxGroup(
	                    choices=channel_opts,
	                    value=channel_opts[:3],
	                    label="Select Channels",
	                )
	                growth_btn = gr.Button("Analyze Growth")

	            with gr.Column(scale=3):
	                growth_plot = gr.Plot()

	        growth_btn.click(
	            fn=channel_growth_analysis,
	            inputs=[growth_channels],
	            outputs=growth_plot,
	        )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()