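# Scraped page header (not part of the program): Hugging Face file view,
# uploader "emsesc", last commit "delete filtered_df" (eb34850),
# links "raw" / "history" / "blame", file size 39.6 kB.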
from dash import Dash, html, dcc, Input, Output, State
from dash import Dash, html, dcc, Input, Output, State
import pandas as pd
import dash_mantine_components as dmc
import duckdb
import time
from graphs.leaderboard import (
button_style,
get_top_n_leaderboard,
render_table_content,
)
from dash_iconify import DashIconify
# Initialize the app
# `app` is the Dash application that the layout and every @app.callback below attach to.
app = Dash()
# Expose the app's underlying server object at module level.
# NOTE(review): presumably so a WSGI runner can import `server` — confirm against the deployment setup.
server = app.server
def load_parquet_to_duckdb(con, parquet_url, view_name):
    """
    Register a remote parquet file as a DuckDB view and report its time span.

    The httpfs extension is installed and loaded on ``con`` first, then
    ``view_name`` is created (or replaced) as a view selecting every column
    from ``read_parquet(parquet_url)``.

    Returns ``(start_dt, end_dt)``: the MIN and MAX of the view's ``time``
    column, each converted with ``pd.to_datetime``.
    """
    # httpfs is what lets read_parquet() reach a remote URL.
    for statement in ("INSTALL httpfs;", "LOAD httpfs;"):
        con.execute(statement)

    # The view only references the remote file.
    con.execute(
        f"\nCREATE OR REPLACE VIEW {view_name} AS\n"
        f"SELECT * FROM read_parquet('{parquet_url}')\n"
    )

    # One aggregate query yields both slider bounds.
    bounds = con.execute(
        f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
    ).fetchdf()
    earliest = pd.to_datetime(bounds["min_time"].iloc[0])
    latest = pd.to_datetime(bounds["max_time"].iloc[0])
    return earliest, latest
# Process-wide in-memory DuckDB connection shared by every query in this module.
con = duckdb.connect(database=":memory:", read_only=False)

# Both parquet files live in the same Hugging Face dataset repository, so the
# URLs are derived from the dataset id instead of repeating it.
HF_DATASET_ID = "emsesc/open_model_evolution_data"
_HF_RESOLVE_ROOT = f"https://huggingface.co/datasets/{HF_DATASET_ID}/resolve/main"
hf_parquet_url_1 = f"{_HF_RESOLVE_ROOT}/all_downloads_with_annotations.parquet"
hf_parquet_url_2 = f"{_HF_RESOLVE_ROOT}/one_year_rolling.parquet"

print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()
    # Register each parquet file as its own view; the first view's time span
    # drives the slider bounds below.
    start_dt, end_dt = load_parquet_to_duckdb(con, hf_parquet_url_1, "all_downloads")
    start_dt2, end_dt2 = load_parquet_to_duckdb(con, hf_parquet_url_2, "one_year_rolling")
    msg = (
        f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
    )
    print(msg)
except Exception as e:
    # Report the failure, then re-raise: the app cannot start without its data.
    err_msg = f"Failed to load dataset(s). Error: {e}"
    print(err_msg)
    raise

# Slider bounds as whole epoch seconds, taken from the "all_downloads" view.
start_ts = int(start_dt.timestamp())
end_ts = int(end_dt.timestamp())
def ordinal(n):
    """Return ``n`` with its English ordinal suffix, e.g. 1 -> "1st", 12 -> "12th"."""
    # 10-20 (in any century) always take "th"; this covers 11th/12th/13th.
    if 10 <= n % 100 <= 20:
        return f"{n}th"
    # Otherwise the last digit decides, defaulting to "th".
    by_last_digit = {1: "st", 2: "nd", 3: "rd"}
    return f"{n}" + by_last_digit.get(n % 10, "th")
def format_date(dt):
    """Format ``dt`` like "Oct 8th, 2025": abbreviated month, ordinal day, year."""
    month_abbrev = dt.strftime("%b")
    day_with_suffix = ordinal(dt.day)
    return month_abbrev + f" {day_with_suffix}, {dt.year}"
# Slider marks: a "Mon YYYY" label at each end, plus a bare year label on every
# January 1st after the start year (skipped if it coincides with the end).
_january_firsts = (
    (year, int(pd.Timestamp(year=year, month=1, day=1).timestamp()))
    for year in range(start_dt.year + 1, end_dt.year + 1)
)
marks = [
    # Start label (e.g. "Jan 2020")
    {"value": start_ts, "label": start_dt.strftime("%b %Y")},
    # Yearly labels in between (e.g. "2021", "2022")
    *(
        {"value": jan_first_ts, "label": str(year)}
        for year, jan_first_ts in _january_firsts
        if jan_first_ts != end_ts
    ),
    # End label (e.g. "Dec 2024")
    {"value": end_ts, "label": end_dt.strftime("%b %Y")},
]
def get_thumb_labels(values):
    """
    Build the date labels attached to the two range-slider thumbs.

    ``values`` holds the two thumb positions as epoch seconds. Returns a list
    of two ``html.Div`` elements, one per thumb, each showing the formatted
    date. When the thumbs are less than roughly four months apart, the first
    label is lifted above the track so the two do not overlap; otherwise both
    sit just below it.
    """
    lower_ts = values[0]
    upper_ts = values[1]
    four_months_in_seconds = 4 * 30 * 86400  # 4 months
    thumbs_are_close = abs(upper_ts - lower_ts) < four_months_in_seconds

    shared_style = {
        "background": "#fff",
        "color": "#082030",
        "fontWeight": "bold",
        "fontSize": "13px",
        "borderRadius": "8px",
        "padding": "2px 8px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
        "position": "absolute",
        "left": "50%",
        "transform": "translateX(-50%)",
        "whiteSpace": "nowrap",
        "zIndex": 100,
    }

    # Only the first label ever moves; the second always sits below the slider.
    first_label_top = "-38px" if thumbs_are_close else "14px"
    placements = ((lower_ts, first_label_top), (upper_ts, "14px"))

    return [
        html.Div(
            format_date(pd.to_datetime(thumb_ts, unit="s")),
            style={**shared_style, "top": top_offset},
        )
        for thumb_ts, top_offset in placements
    ]
# Range slider (dash-mantine-components) for selecting the time window.
# Values are epoch seconds spanning the "all_downloads" view's time range.
time_slider = dmc.RangeSlider(
id="time-slider",
min=start_ts,
max=end_ts,
# Start with the full range selected.
value=[
start_ts,
end_ts,
],
# Values are in seconds, so one step is one day.
step=24 * 60 * 60,
color="#AC482A",
size="md",
radius="xl",
marks=marks,
style={"width": "95%", "paddingLeft": "60px"}, # updated paddingLeft
# The built-in value label is turned off; dates are drawn via thumbChildren instead.
label=None,
showLabelOnHover=False,
labelTransitionProps={"transition": "fade", "duration": 150},
# Initial thumb labels; the update_thumb_labels callback replaces them when the value changes.
thumbChildren=get_thumb_labels([start_ts, end_ts]),
)
# Page layout: two dcc.Store components for shared state, then the header,
# title/intro, the filter panel (download view, author switch, time slider, tip)
# and the three leaderboard tabs (Countries, Developers, Models).
app.layout = dmc.MantineProvider(
theme={
"colorScheme": "light",
"primaryColor": "blue",
"fontFamily": "Inter, sans-serif",
},
children=[
# Name of the DuckDB view to query ("all_downloads" or "one_year_rolling");
# written by update_selected_view.
dcc.Store(id="selected-view", data="all_downloads"),
dcc.Store(id="derived-author-toggle", data=True), # Store for toggle state
html.Div(
[
# Header
html.Div(
[
html.Div(
[
html.Div(
[
html.Div(
children="Economies of Open Intelligence",
style={
"fontSize": 22,
"fontWeight": "700",
"lineHeight": "1.1",
},
),
html.Div(
children="Tracing Power & Participation in the Model Ecosystem",
style={
"fontSize": 13,
"marginTop": 6,
"opacity": 0.9,
},
),
],
style={
"display": "flex",
"flexDirection": "column",
"justifyContent": "center",
},
),
# Header links: Data Provenance Initiative, Hugging Face, paper
html.Div(
[
html.A(
children=[
html.Img(
src="assets/images/dpi.svg",
style={
"height": "28px",
"verticalAlign": "middle",
"paddingRight": "8px",
},
),
"Data Provenance Initiative",
],
href="https://www.dataprovenance.org/",
target="_blank",
className="no-bg-link header-link",
style={
"display": "inline-block",
"padding": "6px 14px",
"fontSize": 13,
"color": "#FFFFFF", # white on dark header
# background removed so CSS controls it
"borderRadius": "18px",
"fontWeight": "700",
"textDecoration": "none",
"marginRight": "12px",
},
),
html.A(
children=[
html.Img(
src="assets/images/hf.svg",
style={
"height": "30px",
"verticalAlign": "middle",
},
),
html.Span(
"Hugging Face",
className="hf-brand-text",
),
],
href="https://huggingface.co/",
target="_blank",
className="no-bg-link header-link",
style={
"display": "inline-flex",
"padding": "6px 14px",
"alignItems": "center",
"color": "#FFFFFF",
"borderRadius": "18px",
"textDecoration": "none",
"marginRight": "12px",
},
),
html.A(
children=[
html.Span(
"Read the paper",
className="paper-text",
),
],
# NOTE(review): this href points at google.com, which looks like a placeholder — confirm the paper URL.
href="https://www.google.com/",
target="_blank",
className="no-bg-link header-link paper-link",
style={
"display": "inline-flex",
"alignItems": "center",
"padding": "6px 12px", # decreased size
"fontSize": 14, # smaller text
"margin": "0 auto",
"backgroundColor": "#AC482A",
"color": "#FFFFFF",
"borderRadius": "5px",
"textDecoration": "none",
"fontWeight": "700",
},
),
],
style={"display": "flex", "alignItems": "center"},
),
],
style={
"marginLeft": "50px",
"marginRight": "50px",
"display": "flex",
"justifyContent": "space-between",
"alignItems": "center",
"padding": "18px 24px",
"gap": "24px",
},
),
],
style={
"backgroundColor": "#082030",
"color": "white",
"width": "100%",
},
),
# Intro / description below header (kept but styled to match layout)
# Title
html.Div(
children="The Open Model Leaderboard",
style={
"fontSize": 40,
"fontWeight": "700",
"textAlign": "center",
"marginTop": 20,
"marginBottom": 20,
},
),
html.Div(
children="This leaderboard assesses concentrations of power in the open model ecosystem across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
style={
"fontSize": 14,
"marginTop": 18,
"marginBottom": 12,
"marginLeft": 100,
"marginRight": 100,
"textAlign": "center",
},
),
# Main content (filters + tabs)
html.Div(
children=[
# Left column: download-view selector and author-type switch
html.Div(
[
html.Div(
"Select Download View",
style={
"fontWeight": "700",
"marginBottom": 8,
"fontSize": 14,
},
),
# Its value is mapped to a DuckDB view name by update_selected_view.
dmc.SegmentedControl(
id="segmented",
value="all-downloads",
color="#AC482A",
transitionDuration=200,
data=[
{
"value": "all-downloads",
"label": "All Downloads",
},
{
"value": "filtered-downloads",
"label": "Filtered Downloads",
},
],
mb=10,
),
html.Div(
"Choose whether to view all downloads or only those within one year of the model's creation date.",
style={
"fontSize": 13,
"color": "#555",
"marginBottom": "12px",
},
),
# Author-type switch, placed below the download-view control
html.Div(
[
html.Div(
"Select Author Type",
style={
"fontWeight": "700",
"marginBottom": 8,
"fontSize": 14,
},
),
dmc.Switch(
id="derived-author-switch", # read by update_derived_author_toggle
color="#AC482A",
label="Derived Authors",
checked=True,
mb=10,
),
html.Div(
"Toggle between viewing downloads by original authors or derived authors (those who forked or adapted models).",
style={
"fontSize": 13,
"color": "#555",
"marginBottom": "12px",
},
),
],
style={"marginTop": "10px"},
),
# NOTE(review): no callback in this file writes to "global-toggle-status" — confirm whether it is still needed.
html.Span(
id="global-toggle-status",
style={
"marginLeft": "8px",
"display": "inline-block",
"marginTop": 6,
},
),
],
style={"flex": 1, "minWidth": "220px"},
),
# Right column: time-range slider and usage tip
html.Div(
[
html.Div(
"Select Time Range",
style={
"fontWeight": "700",
"marginBottom": 8,
"fontSize": 14,
},
),
time_slider,
html.Div(
"Adjust the time range to filter leaderboard results by model download times.",
style={
"fontSize": 13,
"color": "#555",
"marginTop": "32px", # increased from 24px
},
),
# Tip section
html.Div(
[
html.Div(
[
DashIconify(
icon="mdi:lightbulb-on-outline",
width=20,
height=20,
style={"marginRight": "8px", "color": "#082030"},
),
html.Span("Tip"),
],
style={
"fontWeight": "700",
"fontSize": 15,
"marginBottom": "6px",
"color": "#082030",
"display": "flex",
"alignItems": "center",
},
),
html.Div(
[
"Try switching between ",
html.Span("All Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
" and ",
html.Span("Filtered Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
" to compare overall popularity versus early interest after model release. ",
"You can also toggle ON ",
html.Span("Derived Authors", style={"fontWeight": "600", "color": "#AC482A"}),
" to see how derivative works contribute to developer influence.",
],
style={
"fontSize": 13,
"color": "#082030",
"lineHeight": "1.6",
},
),
],
style={
"backgroundColor": "#F5ECE6",
"borderRadius": "14px",
"padding": "18px 20px",
"marginTop": "28px",
"boxShadow": "0 1px 4px rgba(8,32,48,0.04)",
"border": "1px solid #f0e3d6",
},
),
],
style={
"flex": 2,
"minWidth": "320px",
"display": "flex",
"flexDirection": "column",
"justifyContent": "center",
"height": "100%",
},
),
],
style={
"display": "flex",
"gap": "24px",
"padding": "32px",
"alignItems": "flex-start",
"marginLeft": "100px",
"marginRight": "100px",
"backgroundColor": "#FFFBF9",
"borderRadius": "18px",
},
),
# Leaderboard tabs. Each tab holds an intro paragraph, a loading wrapper
# around the table container, and the expand/collapse button; the matching
# update_top_* callback fills the table and sets the button label.
html.Div(
[
dcc.Tabs(
id="leaderboard-tabs",
value="Countries",
children=[
dcc.Tab(
label="Countries",
value="Countries",
style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"color": "#6B7280",
"fontWeight": "500",
},
selected_style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"fontWeight": "700",
"borderBottom": "3px solid #082030",
},
children=[
html.Div(
children="The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
style={
"fontSize": 14,
"marginTop": 18,
"marginBottom": 12,
"textAlign": "left",
},
),
dcc.Loading(
id="loading-countries",
type="circle",
color="#AC482A",
children=html.Div(id="top_countries-table")
),
html.Button(
id="top_countries-toggle",
children="▼ Show Top 50",
n_clicks=0,
style={**button_style, "border": "none"},
),
],
),
dcc.Tab(
label="Developers",
value="Developers",
style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"color": "#6B7280",
"fontWeight": "500",
},
selected_style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"fontWeight": "700",
"borderBottom": "3px solid #082030",
},
children=[
html.Div(
children="The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
style={
"fontSize": 14,
"marginTop": 18,
"marginBottom": 12,
"textAlign": "left",
},
),
dcc.Loading(
id="loading-developers",
type="circle",
color="#AC482A",
children=html.Div(id="top_developers-table")
),
html.Button(
id="top_developers-toggle",
children="▼ Show Top 50",
n_clicks=0,
style={**button_style, "border": "none"},
),
],
),
dcc.Tab(
label="Models",
value="Models",
style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"color": "#6B7280",
"fontWeight": "500",
},
selected_style={
"backgroundColor": "transparent",
"border": "none",
"padding": "10px 18px",
"fontWeight": "700",
"borderBottom": "3px solid #082030",
},
children=[
html.Div(
children="The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
style={
"fontSize": 14,
"marginTop": 18,
"marginBottom": 12,
"textAlign": "left",
},
),
dcc.Loading(
id="loading-models",
type="circle",
color="#AC482A",
children=html.Div(id="top_models-table")
),
html.Button(
id="top_models-toggle",
children="▼ Show Top 50",
n_clicks=0,
style={**button_style, "border": "none"},
),
],
),
],
),
],
style={
"borderRadius": "18px",
"padding": "32px",
"marginTop": "12px",
"marginBottom": "12px", # reduced from 64px
"marginLeft": "50px",
"marginRight": "50px",
},
),
],
style={
"fontFamily": "Inter",
"backgroundColor": "#ffffff",
"minHeight": "100vh",
},
)
],
)
# Callbacks for interactivity
# -- helper utilities to consolidate duplicated callback logic --
def _require_sql_identifier(value, what):
    """Return ``value`` unchanged if it is a plain identifier, else raise ValueError."""
    if not isinstance(value, str) or not value.isidentifier():
        raise ValueError(f"Invalid {what}: {value!r}")
    return value


def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_downloads"):
    """
    Query DuckDB for the top ``top_n`` groups by total downloads.

    Aggregation happens inside DuckDB, so only the final rows are transferred.

    Args:
        slider_value: Optional two-element sequence of epoch seconds
            ``[start, end]``. When present, rows are restricted to
            ``start <= time <= end``; otherwise no time filter is applied.
        group_col: Column of ``view`` to group by.
        top_n: Maximum number of rows to return (coerced with ``int``).
        view: Name of the DuckDB view to read from.

    Returns:
        A pandas DataFrame ordered by ``total_downloads`` descending with the
        columns ``name``, ``total_downloads``, ``percent_of_total`` and one
        arbitrary representative value per group for ``org_country_single``,
        ``author``, ``derived_author``, ``merged_country_groups_single``,
        ``merged_modality`` and ``model``.

    Raises:
        ValueError: If ``group_col`` or ``view`` is not a plain identifier,
            or ``top_n`` cannot be converted to an integer.
    """
    # ``view`` originates from a dcc.Store, i.e. from the browser, and is
    # spliced into the SQL text along with ``group_col`` and ``top_n``.
    # Restrict all three before building the query so a crafted request
    # cannot inject SQL.
    group_col = _require_sql_identifier(group_col, "group column")
    view = _require_sql_identifier(view, "view name")
    top_n = int(top_n)

    # Build time filter clause. The bounds are rendered from pandas
    # Timestamps, never from raw request text.
    time_clause = ""
    if slider_value and len(slider_value) == 2:
        start = pd.to_datetime(slider_value[0], unit="s")
        end = pd.to_datetime(slider_value[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # NOTE(review): when group_col is "org_country_single", base_data selects
    # two columns with that name (the raw one and the normalised CASE result);
    # confirm which one DuckDB resolves ``b.org_country_single`` to.
    query = f"""
        WITH base_data AS (
            SELECT
                {group_col},
                CASE
                    WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                    WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
                    ELSE org_country_single
                END AS org_country_single,
                author,
                derived_author,
                merged_country_groups_single,
                merged_modality,
                downloads,
                model
            FROM {view}
            {time_clause}
        ),
        -- Compute the total downloads for all rows in the time range
        total_downloads_cte AS (
            SELECT SUM(downloads) AS total_downloads_all
            FROM base_data
        ),
        -- Compute per-group totals and their percentage of all downloads
        top_items AS (
            SELECT
                b.{group_col} AS name,
                SUM(b.downloads) AS total_downloads,
                -- NULLIF keeps the percentage NULL instead of dividing by a zero total
                ROUND(SUM(b.downloads) * 100.0 / NULLIF(t.total_downloads_all, 0), 2) AS percent_of_total,
                -- Pick an arbitrary metadata value per group for reference
                ANY_VALUE(b.org_country_single) AS org_country_single,
                ANY_VALUE(b.author) AS author,
                ANY_VALUE(b.derived_author) AS derived_author,
                ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
                ANY_VALUE(b.merged_modality) AS merged_modality,
                ANY_VALUE(b.model) AS model
            FROM base_data b
            CROSS JOIN total_downloads_cte t
            GROUP BY b.{group_col}, t.total_downloads_all
        )
        SELECT *
        FROM top_items
        ORDER BY total_downloads DESC
        LIMIT {top_n};
    """
    return con.execute(query).fetchdf()
def _leaderboard_callback_logic(
    n_clicks,
    slider_value,
    current_label,
    group_col,
    filename,
    default_label="▼ Show Top 50",
    chip_color="#F0F9FF",
    view="all_downloads",
    derived_author_toggle=True,
):
    """
    Shared body of the three leaderboard callbacks.

    Works out how many rows to show and what the toggle button should say
    next, queries DuckDB, and renders the table.

    Args:
        n_clicks: Click count of the tab's toggle button (``None`` is treated as 0).
        slider_value: Time-slider value, forwarded to the DuckDB query.
        current_label: Current text of the toggle button, or ``None``.
        group_col: Column to group the leaderboard by.
        filename: Forwarded to ``render_table_content``.
        default_label: Label used before the first click when ``current_label`` is ``None``.
        chip_color: Forwarded to ``render_table_content``.
        view: DuckDB view to query.
        derived_author_toggle: Forwarded to ``get_top_n_leaderboard``.

    Returns:
        ``(table_content, new_label)`` for the table container and the button.
    """
    # Normalize label on first load
    if current_label is None:
        current_label = default_label

    # The expansion level is derived from the click count alone. These
    # callbacks also fire when the slider, view or author toggle changes;
    # reading the level from the button label (as before) advanced
    # 10 -> 50 -> 100 -> 10 on each of those unrelated triggers.
    # n_clicks only changes on a real button press, so clicks % 3 gives the
    # same cycle for clicks and stays put otherwise.
    expansion_cycle = (
        (10, "▼ Show Top 50"),
        (50, "▼ Show Top 100"),
        (100, "▲ Show Less"),
    )
    clicks = n_clicks or 0
    if clicks == 0:
        # Before any click, keep whatever label the button currently shows.
        top_n, new_label = 10, current_label
    else:
        top_n, new_label = expansion_cycle[clicks % len(expansion_cycle)]

    # Get filtered and aggregated data directly from DuckDB
    df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view=view)
    # Process the already-filtered data - pass derived_author_toggle
    df, download_df = get_top_n_leaderboard(
        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
    )
    table_content = render_table_content(
        df, download_df, chip_color=chip_color, filename=filename
    )
    return table_content, new_label
# -- end helpers --
# --- Mirror the "Derived Authors" switch into its dcc.Store ---
@app.callback(
    Output("derived-author-toggle", "data"),
    Input("derived-author-switch", "checked"),
)
def update_derived_author_toggle(checked):
    """Copy the switch's ``checked`` state into the ``derived-author-toggle`` store."""
    is_checked = checked
    return is_checked
# Leaderboard callbacks (one per tab; all delegate to _leaderboard_callback_logic)
@app.callback(
    Output("top_countries-table", "children"),
    Output("top_countries-toggle", "children"),
    Input("top_countries-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_countries-toggle", "children"),
)
def update_top_countries(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the Countries leaderboard and return its toggle button label."""
    table_and_label = _leaderboard_callback_logic(
        n_clicks=n_clicks,
        slider_value=slider_value,
        current_label=current_label,
        group_col="org_country_single",
        filename="top_countries",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
    return table_and_label
@app.callback(
    Output("top_developers-table", "children"),
    Output("top_developers-toggle", "children"),
    Input("top_developers-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_developers-toggle", "children"),
)
def update_top_developers(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the Developers leaderboard and return its toggle button label."""
    # Group by derived_author when the toggle is on, otherwise by author.
    if derived_author_toggle:
        developer_column = "derived_author"
    else:
        developer_column = "author"
    return _leaderboard_callback_logic(
        n_clicks=n_clicks,
        slider_value=slider_value,
        current_label=current_label,
        group_col=developer_column,
        filename="top_developers",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
@app.callback(
    Output("top_models-table", "children"),
    Output("top_models-toggle", "children"),
    Input("top_models-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_models-toggle", "children"),
)
def update_top_models(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the Models leaderboard and return its toggle button label."""
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="model",
        filename="top_models",
        # Same fallback label as the Countries/Developers callbacks and the
        # button's own initial text. The previous "▼ Show More" matched
        # neither and contains none of the "Show Top ..." phrases the label
        # cycle uses.
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
@app.callback(
    Output("time-slider", "thumbChildren"),
    Input("time-slider", "value"),
)
def update_thumb_labels(values):
    """Rebuild the thumb date labels whenever the slider value changes."""
    thumb_children = get_thumb_labels(values)
    return thumb_children
# --- Translate the segmented control's value into the DuckDB view name ---
@app.callback(
    Output("selected-view", "data"),
    Input("segmented", "value"),
)
def update_selected_view(seg_value):
    """Return "one_year_rolling" for "filtered-downloads", else "all_downloads"."""
    return "one_year_rolling" if seg_value == "filtered-downloads" else "all_downloads"
# Start the development server when this file is executed directly.
if __name__ == "__main__":
    # NOTE(review): debug=True turns on Dash's dev tools; confirm this entry
    # point is never what serves a public deployment.
    app.run(debug=True)