Spaces:

economies-open-ai
/

open-model-evolution

Running

App Files Files Community

open-model-evolution / app.py

emsesc

delete filtered_df

eb34850 29 days ago

raw

history blame

39.6 kB

	from dash import Dash, html, dcc, Input, Output, State
	from dash import Dash, html, dcc, Input, Output, State
	import pandas as pd
	import dash_mantine_components as dmc
	import duckdb
	import time
	from graphs.leaderboard import (
	button_style,
	get_top_n_leaderboard,
	render_table_content,
	)
	from dash_iconify import DashIconify

	# Initialize the app
	# `server` re-exports the app's underlying server object at module level
	# (presumably so an external WSGI runner can import it — confirm against the deployment config).
	app = Dash()
	server = app.server

	def load_parquet_to_duckdb(con, parquet_url, view_name):
	    """
	    Register a remote parquet file as a DuckDB view and report its time span.

	    Args:
	        con: Open DuckDB connection the view is created on.
	        parquet_url: URL of the parquet file; read lazily through the view.
	        view_name: Name of the view to create or replace. It is interpolated
	            into the SQL as-is, so it must be a trusted identifier.

	    Returns:
	        (start_dt, end_dt): MIN and MAX of the view's ``time`` column, each
	        passed through ``pd.to_datetime``.
	    """
	    # Install and load the httpfs extension for remote file access
	    con.execute("INSTALL httpfs;")
	    con.execute("LOAD httpfs;")

	    # Double any single quote so the URL cannot terminate the SQL string
	    # literal early (URLs without quotes are passed through unchanged).
	    url_literal = str(parquet_url).replace("'", "''")

	    # Create a view that references the remote parquet file
	    con.execute(f"""
	        CREATE OR REPLACE VIEW {view_name} AS
	        SELECT * FROM read_parquet('{url_literal}')
	    """)

	    # Time range of the data, used by the caller to bound the slider
	    time_range = con.execute(
	        f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
	    ).fetchdf()
	    start_dt = pd.to_datetime(time_range["min_time"].iloc[0])
	    end_dt = pd.to_datetime(time_range["max_time"].iloc[0])
	    return start_dt, end_dt

	# Process-wide in-memory DuckDB connection shared by every query below
	con = duckdb.connect(database=":memory:", read_only=False)

	# Remote parquet files hosted in the Hugging Face dataset repo
	HF_DATASET_ID = "emsesc/open_model_evolution_data"
	hf_parquet_url_1 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/all_downloads_with_annotations.parquet"
	hf_parquet_url_2 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/one_year_rolling.parquet"

	print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
	try:
	    overall_start_time = time.time()

	    # Register each parquet file as its own view; only the first view's
	    # time range (start_dt, end_dt) is used further down for the slider.
	    start_dt, end_dt = load_parquet_to_duckdb(
	        con, hf_parquet_url_1, "all_downloads"
	    )
	    start_dt2, end_dt2 = load_parquet_to_duckdb(
	        con, hf_parquet_url_2, "one_year_rolling"
	    )

	    msg = f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
	    print(msg)
	except Exception as e:
	    # Report the failure, then re-raise: the app cannot start without data.
	    err_msg = f"Failed to load dataset(s). Error: {e}"
	    print(err_msg)
	    raise

	# Create a dcc slider for time range selection by year (readable marks)
	# Slider bounds as integer Unix timestamps (seconds), taken from the time
	# range of the "all_downloads" view loaded above.
	start_ts = int(start_dt.timestamp())
	end_ts = int(end_dt.timestamp())

	def ordinal(n):
	    """Return ``n`` followed by its English ordinal suffix, e.g. 1st, 2nd, 3rd, 4th."""
	    # Anything whose last two digits fall in 10-20 takes "th"; this covers
	    # the 11th/12th/13th exceptions to the last-digit rule.
	    in_teens = 10 <= n % 100 <= 20
	    ending = "th" if in_teens else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
	    return f"{n}{ending}"

	def format_date(dt):
	    """Render ``dt`` as abbreviated month, ordinal day and year, e.g. "Oct 8th, 2025"."""
	    month_abbr = dt.strftime("%b")
	    day_label = ordinal(dt.day)
	    return f"{month_abbr} {day_label}, {dt.year}"

	# Slider marks: a "Mon YYYY" label at each end of the range and a plain
	# year label on every 1 January in between.
	marks = [{"value": start_ts, "label": start_dt.strftime("%b %Y")}]
	for yr in range(start_dt.year, end_dt.year + 1):
	    yr_ts = int(pd.Timestamp(year=yr, month=1, day=1).timestamp())
	    start_yr = int(pd.Timestamp(year=start_dt.year, month=1, day=1).timestamp())
	    # Skip 1 January of the first year and any mark that lands exactly on the end label.
	    if yr_ts in (start_yr, end_ts):
	        continue
	    marks.append({"value": yr_ts, "label": str(yr)})
	marks.append({"value": end_ts, "label": end_dt.strftime("%b %Y")})

	def get_thumb_labels(values):
	    """
	    Build the two date labels shown on the range-slider thumbs.

	    ``values`` holds the two thumb positions as Unix timestamps in seconds.
	    Returns a list of two ``html.Div`` elements, one per thumb, in the same
	    order as ``values``.
	    """
	    # Thumbs closer than 4 * 30 days: lift the first label above the track
	    # so the two labels do not overlap; otherwise both sit below it.
	    thumbs_are_close = abs(values[1] - values[0]) < 4 * 30 * 86400  # 4 months
	    first_top = "-38px" if thumbs_are_close else "14px"

	    shared_style = {
	        "background": "#fff",
	        "color": "#082030",
	        "fontWeight": "bold",
	        "fontSize": "13px",
	        "borderRadius": "8px",
	        "padding": "2px 8px",
	        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
	        "position": "absolute",
	        "left": "50%",
	        "transform": "translateX(-50%)",
	        "whiteSpace": "nowrap",
	        "zIndex": 100,
	    }

	    return [
	        html.Div(
	            format_date(pd.to_datetime(thumb_ts, unit="s")),
	            style={**shared_style, "top": top_offset},
	        )
	        for thumb_ts, top_offset in zip(values, (first_top, "14px"))
	    ]

	# Two-thumb range slider over the dataset's time span. Despite the older
	# "dcc slider" wording, the component is dash_mantine_components.RangeSlider.
	# Positions are Unix timestamps in seconds; both thumbs start at the extremes.
	time_slider = dmc.RangeSlider(
	id="time-slider",
	min=start_ts,
	max=end_ts,
	value=[
	start_ts,
	end_ts,
	],
	step=24 * 60 * 60, # one day, in seconds
	color="#AC482A",
	size="md",
	radius="xl",
	marks=marks,
	style={"width": "95%", "paddingLeft": "60px"}, # updated paddingLeft
	# The built-in label is switched off; the custom thumbChildren below are
	# used as the thumb labels instead (refreshed by a callback on value change).
	label=None,
	showLabelOnHover=False,
	labelTransitionProps={"transition": "fade", "duration": 150},
	thumbChildren=get_thumb_labels([start_ts, end_ts]),
	)

	# ---- Page layout ------------------------------------------------------------
	# The layout is assembled from small private builders so that pieces which
	# were previously copy-pasted (filter captions, hint text, the three
	# leaderboard tabs) are defined exactly once. Component ids, text and style
	# values are unchanged.
	_ACCENT_COLOR = "#AC482A"
	_LEADERBOARD_BLURB = "The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face."


	def _section_label(text):
	    """Bold caption shown above a filter control."""
	    return html.Div(
	        text,
	        style={"fontWeight": "700", "marginBottom": 8, "fontSize": 14},
	    )


	def _hint(text, **spacing):
	    """Muted helper text for a control; ``spacing`` adds the margin style key."""
	    return html.Div(text, style={"fontSize": 13, "color": "#555", **spacing})


	def _accent(text):
	    """Inline span highlighting a control name inside the tip text."""
	    return html.Span(text, style={"fontWeight": "600", "color": _ACCENT_COLOR})


	def _build_header():
	    """Dark top bar: title and subtitle on the left, external links on the right."""
	    title_block = html.Div(
	        [
	            html.Div(
	                children="Economies of Open Intelligence",
	                style={"fontSize": 22, "fontWeight": "700", "lineHeight": "1.1"},
	            ),
	            html.Div(
	                children="Tracing Power & Participation in the Model Ecosystem",
	                style={"fontSize": 13, "marginTop": 6, "opacity": 0.9},
	            ),
	        ],
	        style={
	            "display": "flex",
	            "flexDirection": "column",
	            "justifyContent": "center",
	        },
	    )
	    dpi_link = html.A(
	        children=[
	            html.Img(
	                src="assets/images/dpi.svg",
	                style={
	                    "height": "28px",
	                    "verticalAlign": "middle",
	                    "paddingRight": "8px",
	                },
	            ),
	            "Data Provenance Initiative",
	        ],
	        href="https://www.dataprovenance.org/",
	        target="_blank",
	        className="no-bg-link header-link",
	        style={
	            "display": "inline-block",
	            "padding": "6px 14px",
	            "fontSize": 13,
	            "color": "#FFFFFF",  # white on dark header
	            # no background here so the CSS classes control it
	            "borderRadius": "18px",
	            "fontWeight": "700",
	            "textDecoration": "none",
	            "marginRight": "12px",
	        },
	    )
	    hf_link = html.A(
	        children=[
	            html.Img(
	                src="assets/images/hf.svg",
	                style={"height": "30px", "verticalAlign": "middle"},
	            ),
	            html.Span("Hugging Face", className="hf-brand-text"),
	        ],
	        href="https://huggingface.co/",
	        target="_blank",
	        className="no-bg-link header-link",
	        style={
	            "display": "inline-flex",
	            "padding": "6px 14px",
	            "alignItems": "center",
	            "color": "#FFFFFF",
	            "borderRadius": "18px",
	            "textDecoration": "none",
	            "marginRight": "12px",
	        },
	    )
	    paper_link = html.A(
	        children=[html.Span("Read the paper", className="paper-text")],
	        # TODO(review): this href looks like a placeholder, not a paper URL —
	        # replace once the real link is known.
	        href="https://www.google.com/",
	        target="_blank",
	        className="no-bg-link header-link paper-link",
	        style={
	            "display": "inline-flex",
	            "alignItems": "center",
	            "padding": "6px 12px",
	            "fontSize": 14,
	            "margin": "0 auto",
	            "backgroundColor": "#AC482A",
	            "color": "#FFFFFF",
	            "borderRadius": "5px",
	            "textDecoration": "none",
	            "fontWeight": "700",
	        },
	    )
	    links = html.Div(
	        [dpi_link, hf_link, paper_link],
	        style={"display": "flex", "alignItems": "center"},
	    )
	    bar = html.Div(
	        [title_block, links],
	        style={
	            "marginLeft": "50px",
	            "marginRight": "50px",
	            "display": "flex",
	            "justifyContent": "space-between",
	            "alignItems": "center",
	            "padding": "18px 24px",
	            "gap": "24px",
	        },
	    )
	    return html.Div(
	        [bar],
	        style={"backgroundColor": "#082030", "color": "white", "width": "100%"},
	    )


	def _build_filters():
	    """Filter card: download-view and author-type controls, time slider and tip box."""
	    view_and_author_column = html.Div(
	        [
	            _section_label("Select Download View"),
	            dmc.SegmentedControl(
	                id="segmented",
	                value="all-downloads",
	                color="#AC482A",
	                transitionDuration=200,
	                data=[
	                    {"value": "all-downloads", "label": "All Downloads"},
	                    {"value": "filtered-downloads", "label": "Filtered Downloads"},
	                ],
	                mb=10,
	            ),
	            _hint(
	                "Choose whether to view all downloads or only those within one year of the model's creation date.",
	                marginBottom="12px",
	            ),
	            html.Div(
	                [
	                    _section_label("Select Author Type"),
	                    dmc.Switch(
	                        id="derived-author-switch",
	                        color="#AC482A",
	                        label="Derived Authors",
	                        checked=True,
	                        mb=10,
	                    ),
	                    _hint(
	                        "Toggle between viewing downloads by original authors or derived authors (those who forked or adapted models).",
	                        marginBottom="12px",
	                    ),
	                ],
	                style={"marginTop": "10px"},
	            ),
	            html.Span(
	                id="global-toggle-status",
	                style={
	                    "marginLeft": "8px",
	                    "display": "inline-block",
	                    "marginTop": 6,
	                },
	            ),
	        ],
	        style={"flex": 1, "minWidth": "220px"},
	    )

	    tip_box = html.Div(
	        [
	            html.Div(
	                [
	                    DashIconify(
	                        icon="mdi:lightbulb-on-outline",
	                        width=20,
	                        height=20,
	                        style={"marginRight": "8px", "color": "#082030"},
	                    ),
	                    html.Span("Tip"),
	                ],
	                style={
	                    "fontWeight": "700",
	                    "fontSize": 15,
	                    "marginBottom": "6px",
	                    "color": "#082030",
	                    "display": "flex",
	                    "alignItems": "center",
	                },
	            ),
	            html.Div(
	                [
	                    "Try switching between ",
	                    _accent("All Downloads"),
	                    " and ",
	                    _accent("Filtered Downloads"),
	                    " to compare overall popularity versus early interest after model release. ",
	                    "You can also toggle ON ",
	                    _accent("Derived Authors"),
	                    " to see how derivative works contribute to developer influence.",
	                ],
	                style={"fontSize": 13, "color": "#082030", "lineHeight": "1.6"},
	            ),
	        ],
	        style={
	            "backgroundColor": "#F5ECE6",
	            "borderRadius": "14px",
	            "padding": "18px 20px",
	            "marginTop": "28px",
	            "boxShadow": "0 1px 4px rgba(8,32,48,0.04)",
	            "border": "1px solid #f0e3d6",
	        },
	    )

	    time_column = html.Div(
	        [
	            _section_label("Select Time Range"),
	            time_slider,
	            _hint(
	                "Adjust the time range to filter leaderboard results by model download times.",
	                marginTop="32px",
	            ),
	            tip_box,
	        ],
	        style={
	            "flex": 2,
	            "minWidth": "320px",
	            "display": "flex",
	            "flexDirection": "column",
	            "justifyContent": "center",
	            "height": "100%",
	        },
	    )

	    return html.Div(
	        children=[view_and_author_column, time_column],
	        style={
	            "display": "flex",
	            "gap": "24px",
	            "padding": "32px",
	            "alignItems": "flex-start",
	            "marginLeft": "100px",
	            "marginRight": "100px",
	            "backgroundColor": "#FFFBF9",
	            "borderRadius": "18px",
	        },
	    )


	def _leaderboard_tab(label):
	    """
	    Build one leaderboard tab (blurb, loading-wrapped table slot, expand button).

	    Component ids are derived from the lower-cased label: ``loading-<slug>``,
	    ``top_<slug>-table`` and ``top_<slug>-toggle``; the callbacks further down
	    target exactly those ids.
	    """
	    slug = label.lower()
	    shared = {
	        "backgroundColor": "transparent",
	        "border": "none",
	        "padding": "10px 18px",
	    }
	    return dcc.Tab(
	        label=label,
	        value=label,
	        style={**shared, "color": "#6B7280", "fontWeight": "500"},
	        selected_style={
	            **shared,
	            "fontWeight": "700",
	            "borderBottom": "3px solid #082030",
	        },
	        children=[
	            html.Div(
	                children=_LEADERBOARD_BLURB,
	                style={
	                    "fontSize": 14,
	                    "marginTop": 18,
	                    "marginBottom": 12,
	                    "textAlign": "left",
	                },
	            ),
	            dcc.Loading(
	                id=f"loading-{slug}",
	                type="circle",
	                color="#AC482A",
	                children=html.Div(id=f"top_{slug}-table"),
	            ),
	            html.Button(
	                id=f"top_{slug}-toggle",
	                children="▼ Show Top 50",
	                n_clicks=0,
	                style={**button_style, "border": "none"},
	            ),
	        ],
	    )


	def _build_tabs_panel():
	    """Tabbed leaderboard area; the Countries tab is selected initially."""
	    return html.Div(
	        [
	            dcc.Tabs(
	                id="leaderboard-tabs",
	                value="Countries",
	                children=[
	                    _leaderboard_tab(tab_label)
	                    for tab_label in ("Countries", "Developers", "Models")
	                ],
	            ),
	        ],
	        style={
	            "borderRadius": "18px",
	            "padding": "32px",
	            "marginTop": "12px",
	            "marginBottom": "12px",
	            "marginLeft": "50px",
	            "marginRight": "50px",
	        },
	    )


	app.layout = dmc.MantineProvider(
	    theme={
	        "colorScheme": "light",
	        "primaryColor": "blue",
	        "fontFamily": "Inter, sans-serif",
	    },
	    children=[
	        # Stores read by the leaderboard callbacks: the DuckDB view to query
	        # (all_downloads or one_year_rolling) and the derived-author switch state.
	        dcc.Store(id="selected-view", data="all_downloads"),
	        dcc.Store(id="derived-author-toggle", data=True),
	        html.Div(
	            [
	                _build_header(),
	                # Page title and introduction
	                html.Div(
	                    children="The Open Model Leaderboard",
	                    style={
	                        "fontSize": 40,
	                        "fontWeight": "700",
	                        "textAlign": "center",
	                        "marginTop": 20,
	                        "marginBottom": 20,
	                    },
	                ),
	                html.Div(
	                    children="This leaderboard assesses concentrations of power in the open model ecosystem across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
	                    style={
	                        "fontSize": 14,
	                        "marginTop": 18,
	                        "marginBottom": 12,
	                        "marginLeft": 100,
	                        "marginRight": 100,
	                        "textAlign": "center",
	                    },
	                ),
	                # Main content (filters + tabs)
	                _build_filters(),
	                _build_tabs_panel(),
	            ],
	            style={
	                "fontFamily": "Inter",
	                "backgroundColor": "#ffffff",
	                "minHeight": "100vh",
	            },
	        ),
	    ],
	)


	# Callbacks for interactivity
	# -- helper utilities to consolidate duplicated callback logic --
	def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_downloads"):
	    """
	    Query DuckDB for the top ``top_n`` groups by total downloads.

	    Aggregation happens inside DuckDB so only the final rows are transferred.

	    Args:
	        slider_value: Optional two-item sequence of Unix timestamps (seconds)
	            bounding the ``time`` column, inclusive. Any other value disables
	            the time filter.
	        group_col: Column to group by; must be a plain identifier.
	        top_n: Maximum number of rows to return (coerced to ``int``).
	        view: Name of the DuckDB view to read; must be a plain identifier.

	    Returns:
	        DataFrame with ``name``, ``total_downloads``, ``percent_of_total`` and
	        one representative value per metadata column, sorted by downloads.

	    Raises:
	        ValueError: If ``view`` or ``group_col`` is not a plain identifier.
	    """
	    # Both names are spliced into the SQL text. `view` is fed from the
	    # "selected-view" dcc.Store via the callbacks, so treat it as
	    # client-controlled and refuse anything that is not a single bare
	    # identifier before it can reach the query.
	    for role, identifier in (("view", view), ("group_col", group_col)):
	        if not (isinstance(identifier, str) and identifier.isidentifier()):
	            raise ValueError(f"Invalid {role} name: {identifier!r}")
	    top_n = int(top_n)

	    # Build time filter clause. The bounds pass through pd.to_datetime, so
	    # only well-formed timestamps are ever rendered into the SQL.
	    time_clause = ""
	    if slider_value and len(slider_value) == 2:
	        start = pd.to_datetime(slider_value[0], unit="s")
	        end = pd.to_datetime(slider_value[1], unit="s")
	        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

	    # NOTE(review): callers pass group_col values (org_country_single, author,
	    # derived_author, model) that are also selected explicitly in base_data,
	    # so that CTE lists the same column name twice — confirm which of the two
	    # `b.{group_col}` resolves to, especially for the normalised country.
	    query = f"""
	    WITH base_data AS (
	        SELECT
	            {group_col},
	            CASE
	                WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
	                WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
	                ELSE org_country_single
	            END AS org_country_single,
	            author,
	            derived_author,
	            merged_country_groups_single,
	            merged_modality,
	            downloads,
	            model
	        FROM {view}
	        {time_clause}
	    ),

	    -- Compute the total downloads for all rows in the time range
	    total_downloads_cte AS (
	        SELECT SUM(downloads) AS total_downloads_all
	        FROM base_data
	    ),

	    -- Compute per-group totals and their percentage of all downloads
	    top_items AS (
	        SELECT
	            b.{group_col} AS name,
	            SUM(b.downloads) AS total_downloads,
	            ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
	            -- Pick first non-null metadata values for reference
	            ANY_VALUE(b.org_country_single) AS org_country_single,
	            ANY_VALUE(b.author) AS author,
	            ANY_VALUE(b.derived_author) AS derived_author,
	            ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
	            ANY_VALUE(b.merged_modality) AS merged_modality,
	            ANY_VALUE(b.model) AS model
	        FROM base_data b
	        CROSS JOIN total_downloads_cte t
	        GROUP BY b.{group_col}, t.total_downloads_all
	    )

	    SELECT *
	    FROM top_items
	    ORDER BY total_downloads DESC
	    LIMIT {top_n};
	    """

	    return con.execute(query).fetchdf()


	def _toggle_click_triggered():
	    """Return True/False for "a *-toggle button fired this callback", or None if unknown."""
	    from dash import callback_context
	    from dash.exceptions import MissingCallbackContextException

	    try:
	        triggered = callback_context.triggered
	    except (MissingCallbackContextException, RuntimeError):
	        # Called outside a running callback, so there is nothing to inspect.
	        return None
	    return any(
	        str(item.get("prop_id", "")).endswith("-toggle.n_clicks")
	        for item in triggered
	    )


	def _leaderboard_page_state(current_label, advance):
	    """Map the button label to ``(top_n, new_label)``, stepping to the next size if ``advance``."""
	    if advance:
	        # The label names the size the next click will reveal.
	        if "Show Top 50" in current_label:
	            return 50, "▼ Show Top 100"
	        if "Show Top 100" in current_label:
	            return 100, "▲ Show Less"
	        return 10, "▼ Show Top 50"

	    # No click: keep the size currently on screen, i.e. one step behind the
	    # size the label advertises.
	    if "Show Top 100" in current_label:
	        return 50, current_label
	    if "Show Less" in current_label:
	        return 100, current_label
	    return 10, current_label


	def _leaderboard_callback_logic(
	    n_clicks,
	    slider_value,
	    current_label,
	    group_col,
	    filename,
	    default_label="▼ Show Top 50",
	    chip_color="#F0F9FF",
	    view="all_downloads",
	    derived_author_toggle=True,
	):
	    """
	    Shared body of the three leaderboard callbacks.

	    Args:
	        n_clicks: Click count of the tab's toggle button.
	        slider_value: Current time-slider value, forwarded to the query.
	        current_label: Current text of the toggle button (``None`` on first load).
	        group_col: Column the leaderboard is grouped by.
	        filename: Passed through to ``render_table_content``.
	        default_label: Label assumed when ``current_label`` is ``None``.
	        chip_color: Passed through to ``render_table_content``.
	        view: DuckDB view to query.
	        derived_author_toggle: Passed through to ``get_top_n_leaderboard``.

	    Returns:
	        ``(table_content, new_label)`` for the table container and the button.
	    """
	    # Normalize label on first load
	    if current_label is None:
	        current_label = default_label

	    # The callbacks also fire for slider, view and switch changes. Only an
	    # actual button click may cycle 10 -> 50 -> 100 -> 10; any other trigger
	    # must re-query at the size already shown. Previously every trigger with
	    # n_clicks > 0 advanced the state.
	    button_fired = _toggle_click_triggered()
	    if button_fired is None:
	        # No callback context available: fall back to the click counter.
	        button_fired = bool(n_clicks)
	    top_n, new_label = _leaderboard_page_state(
	        current_label, advance=bool(button_fired and n_clicks)
	    )

	    # Get filtered and aggregated data directly from DuckDB
	    df_filtered = _get_filtered_top_n_from_duckdb(
	        slider_value, group_col, top_n, view=view
	    )

	    # Process the already-aggregated rows into table and download frames
	    df, download_df = get_top_n_leaderboard(
	        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
	    )
	    table = render_table_content(
	        df, download_df, chip_color=chip_color, filename=filename
	    )
	    return table, new_label


	# -- end helpers --


	# --- Callback to store derived author toggle state ---
	@app.callback(
	    Output("derived-author-toggle", "data"),
	    Input("derived-author-switch", "checked"),
	)
	def update_derived_author_toggle(checked):
	    """Mirror the Derived Authors switch state into the ``derived-author-toggle`` store."""
	    return checked


	# Callbacks for interactivity (modularized)
	@app.callback(
	    Output("top_countries-table", "children"),
	    Output("top_countries-toggle", "children"),
	    Input("top_countries-toggle", "n_clicks"),
	    Input("time-slider", "value"),
	    Input("selected-view", "data"),
	    Input("derived-author-toggle", "data"),
	    State("top_countries-toggle", "children"),
	)
	def update_top_countries(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
	    """Refresh the Countries table and the label of its expand/collapse button."""
	    options = {
	        "group_col": "org_country_single",
	        "filename": "top_countries",
	        "default_label": "▼ Show Top 50",
	        "chip_color": "#F0F9FF",
	        "view": selected_view,
	        "derived_author_toggle": derived_author_toggle,
	    }
	    return _leaderboard_callback_logic(n_clicks, slider_value, current_label, **options)


	@app.callback(
	    Output("top_developers-table", "children"),
	    Output("top_developers-toggle", "children"),
	    Input("top_developers-toggle", "n_clicks"),
	    Input("time-slider", "value"),
	    Input("selected-view", "data"),
	    Input("derived-author-toggle", "data"),
	    State("top_developers-toggle", "children"),
	)
	def update_top_developers(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
	    """Refresh the Developers table and the label of its expand/collapse button."""
	    # Group by the derived author when the switch is on, else by the original author.
	    group_col = "author"
	    if derived_author_toggle:
	        group_col = "derived_author"

	    options = {
	        "group_col": group_col,
	        "filename": "top_developers",
	        "default_label": "▼ Show Top 50",
	        "chip_color": "#F0F9FF",
	        "view": selected_view,
	        "derived_author_toggle": derived_author_toggle,
	    }
	    return _leaderboard_callback_logic(n_clicks, slider_value, current_label, **options)


	@app.callback(
	    Output("top_models-table", "children"),
	    Output("top_models-toggle", "children"),
	    Input("top_models-toggle", "n_clicks"),
	    Input("time-slider", "value"),
	    Input("selected-view", "data"),
	    Input("derived-author-toggle", "data"),
	    State("top_models-toggle", "children"),
	)
	def update_top_models(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
	    """
	    Refresh the Models table and the label of its expand/collapse button.

	    Returns the ``(table_content, new_label)`` pair produced by
	    ``_leaderboard_callback_logic`` for the ``model`` grouping.
	    """
	    return _leaderboard_callback_logic(
	        n_clicks,
	        slider_value,
	        current_label,
	        group_col="model",
	        filename="top_models",
	        # Was "▼ Show More", which matches neither the button's initial text
	        # in the layout nor any label the shared logic recognises, so the
	        # first click after a missing label would have been a visual no-op.
	        # Aligned with the Countries and Developers callbacks.
	        default_label="▼ Show Top 50",
	        chip_color="#F0F9FF",
	        view=selected_view,
	        derived_author_toggle=derived_author_toggle,
	    )


	@app.callback(
	    Output("time-slider", "thumbChildren"),
	    Input("time-slider", "value"),
	)
	def update_thumb_labels(values):
	    """Rebuild the slider's thumb date labels whenever the selected range changes."""
	    thumb_labels = get_thumb_labels(values)
	    return thumb_labels

	# --- Add callback to update selected view based on segmented control ---
	@app.callback(
	    Output("selected-view", "data"),
	    Input("segmented", "value"),
	)
	def update_selected_view(seg_value):
	    """Translate the segmented-control value into the name of the DuckDB view to query."""
	    # Anything other than "filtered-downloads" falls back to the full data set.
	    return "one_year_rolling" if seg_value == "filtered-downloads" else "all_downloads"

	# Start the development server only when this file is executed directly
	# (importing the module, e.g. to pick up `server`, must not start it).
	# NOTE(review): debug=True is hard-coded — confirm this entry point is never
	# the one used for a public deployment.
	if __name__ == "__main__":
	    app.run(debug=True)