File size: 4,274 Bytes
38fcf43 195a3cd 38fcf43 842e3d0 38fcf43 195a3cd 38fcf43 842e3d0 38fcf43 bb1ee8a 38fcf43 2b87392 38fcf43 e19490a 38fcf43 2b87392 38fcf43 2b87392 38fcf43 2b87392 38fcf43 2b87392 38fcf43 2b87392 38fcf43 e19490a 38fcf43 842e3d0 38fcf43 e19490a 38fcf43 2b87392 e19490a 2b87392 38fcf43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd
from functools import lru_cache
# Tags offered in the dropdown. The list is sorted so the menu is always
# alphabetical, regardless of the order in which entries are added here.
choices = [
    "art",
    "biology",
    "code",
    "distilabel",
    "fiftyone",
    "legal",
    "medical",
    "sentence-transformers",
    "synthetic",
]
choices.sort()
@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the Hub datasets carrying a tag and group them by author.

    Results are memoised per tag for the lifetime of the process, so the
    returned objects are shared between calls and must be treated as
    read-only by callers.

    Args:
        framework: Tag/framework name sent as the ``filter`` query parameter.

    Returns:
        A ``(data, grouped)`` tuple: ``data`` is the decoded JSON response and
        ``grouped`` maps each ``"author"`` value to its list of records,
        ordered from the most to the fewest datasets.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        httpx.RequestError: If the request itself fails (e.g. timeout).
    """
    # Passing the tag through ``params`` lets httpx URL-encode it; custom
    # dropdown values may contain characters such as "&", "#" or spaces.
    response = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
        timeout=30.0,
    )
    # Fail loudly instead of grouping (and caching) an error payload.
    response.raise_for_status()
    data = response.json()
    # NOTE(review): only a single response page is read here; confirm whether
    # the endpoint paginates results for very popular tags.
    grouped = groupby(lambda record: record["author"], data)
    grouped = dict(
        sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of the datasets found for a tag.

    Args:
        data: Raw list of dataset records. Unused here; kept so the
            signature stays compatible with existing callers.
        grouped: Mapping of author name to that author's dataset records.
            Authors are listed in the mapping's iteration order.
        framework: Tag/framework name shown in the heading.

    Returns:
        The Markdown text: heading, dataset and author totals, then one
        bullet per author with the number of datasets they own.
    """
    total_datasets = sum(len(datasets) for datasets in grouped.values())
    # Collect the pieces and join once rather than growing a string with
    # repeated ``+=`` inside the loop.
    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        # NOTE(review): the sub-bullet is indented by a single space; confirm
        # the Markdown renderer nests it under the author bullet as intended.
        parts.append(f" - **Number of datasets:** {len(datasets)}\n")
    return "".join(parts)
def _monthly_dataset_counts(data):
    """Aggregate dataset records into per-month counts for completed months.

    Returns a DataFrame with the columns ``month`` (``"YYYY-MM"`` strings in
    ascending order), ``count``, ``cumulative_count`` and ``growth_rate``.
    An empty ``data`` list yields an empty frame with those same columns.
    """
    # Selecting the column explicitly means an empty ``data`` list produces a
    # zero-row frame instead of a KeyError on ``df["createdAt"]``.
    df = pd.DataFrame(data, columns=["createdAt"])
    created = pd.to_datetime(df["createdAt"], utc=True)
    # Normalise to UTC, then drop the timezone so ``to_period`` does not warn
    # about discarding timezone information.
    df["month"] = created.dt.tz_localize(None).dt.to_period("M").astype(str)
    # Exclude the current (still incomplete) month. The cutoff is taken in UTC
    # to match the normalised timestamps; "YYYY-MM" strings order correctly
    # under plain string comparison.
    current_month = pd.Timestamp.now(tz="UTC").strftime("%Y-%m")
    df = df[df["month"] < current_month]
    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # Months without any dataset have no row, so this rate is relative to the
    # previous month that does have one.
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework, show_growth_rate=True):
    """Build a line chart of cumulative dataset count per month for a tag.

    Args:
        data: List of dataset records; each record's ``"createdAt"`` value is
            parsed as a timestamp.
        framework: Tag/framework name used in the chart and axis titles.
        show_growth_rate: When true, add a second trace on a right-hand axis
            showing the percentage change of the monthly count.

    Returns:
        The Plotly figure.
    """
    df_counts = _monthly_dataset_counts(data)
    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        # The original set the y-axis title twice in this call; only the
        # later, tag-specific one took effect, so that is the one kept.
        yaxis_title=f"Cumulative Number of Datasets ({framework})",
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )
    if show_growth_rate:
        fig.update_layout(
            yaxis2=dict(
                title="Month-over-Month Growth Rate",
                overlaying="y",
                side="right",
                tickformat=",.0%",
            )
        )
        fig.add_scatter(
            x=df_counts["month"],
            y=df_counts["growth_rate"],
            name="Growth Rate",
            yaxis="y2",
        )
    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets"
                + (" and month-over-month growth rate" if show_growth_rate else ""),
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
def update_dashboard(framework, show_growth_rate=True):
    """Recompute the plot and Markdown summary for the selected tag.

    Args:
        framework: Tag/framework name chosen in the dropdown. May be ``None``
            or blank when nothing has been selected yet.
        show_growth_rate: Whether the plot includes the growth-rate trace.

    Returns:
        A ``(figure, markdown)`` tuple. When no tag is selected the figure is
        ``None`` and the Markdown is a short prompt.
    """
    tag = framework.strip() if isinstance(framework, str) else ""
    if not tag:
        # The checkbox can fire before any tag is chosen; skip the request
        # rather than querying the API with a missing filter.
        return None, "Select a framework/tag to display its dashboard."
    data, grouped = fetch_data(tag)
    dashboard = generate_dashboard(data, grouped, tag)
    fig = plot_datasets_growth(data, tag, show_growth_rate)
    return fig, dashboard
# UI layout: a tag selector and a growth-rate toggle drive one plot and one
# Markdown summary.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset frameworks/tags on the Hub")
    gr.Markdown(
        "This dashboard displays the number of datasets per author and the growth of datasets over time for a given framework/tag."
    )
    framework = gr.Dropdown(
        choices=choices,
        allow_custom_value=True,
        label="Select a framework/tag",
    )
    show_growth_rate = gr.Checkbox(True, label="Show growth rate")
    plot = gr.Plot(label="Growth of datasets over time")
    markdown = gr.Markdown(label="summary")
    # Changing either control refreshes both outputs from the same inputs.
    for control in (framework, show_growth_rate):
        control.change(
            update_dashboard,
            inputs=[framework, show_growth_rate],
            outputs=[plot, markdown],
        )

# Only start the server when run as a script, so importing this module
# (e.g. from tests or tooling) has no side effect beyond building `demo`.
if __name__ == "__main__":
    demo.launch()
|