Spaces:
Runtime error
Runtime error
File size: 5,977 Bytes
182c1d0 02b09bd 182c1d0 02b09bd 182c1d0 c66e8f9 182c1d0 02b09bd a7a5300 182c1d0 02b09bd 182c1d0 3533641 02b09bd 3533641 538d051 3533641 538d051 02b09bd 1e378d7 02b09bd 3533641 538d051 3533641 538d051 182c1d0 c66e8f9 1d67108 1e378d7 e4f3e8d 182c1d0 e4f3e8d 182c1d0 02b09bd 182c1d0 00ccb92 3533641 d7a9899 054abb2 3533641 c66e8f9 02b09bd 182c1d0 02b09bd 538d051 02b09bd 362494d 02b09bd 4653f13 3533641 182c1d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
from huggingface_hub import list_models
from toolz import groupby
import gradio as gr
from tqdm.auto import tqdm
import pandas as pd
from cachetools import cached, TTLCache
@cached(TTLCache(maxsize=10, ttl=60 * 60 * 3))
def get_all_models():
    """Return the Hub models that have been downloaded more than once.

    The listing is requested with card data attached and ordered by
    downloads (``sort="downloads", direction=-1``). The result is memoised
    by ``cachetools`` for three hours (``ttl=60 * 60 * 3`` seconds), so
    repeated calls inside that window do not hit the Hub again.

    Returns:
        list: model-info objects whose ``downloads`` value is greater than 1.
    """
    listing = list_models(cardData=True, limit=None, sort="downloads", direction=-1)
    return [
        model
        for model in tqdm(listing)
        # Skip empty entries, and treat a missing/None download count as 0
        # rather than letting the comparison raise. The threshold drops
        # models with 0 *or* 1 downloads.
        if model is not None and (getattr(model, "downloads", None) or 0) > 1
    ]
def has_base_model_info(model):
    """Tell whether ``model`` declares a single base model in its card data.

    Returns ``True`` only when ``model.cardData`` is truthy and its
    ``"base_model"`` entry is a non-empty string. Any ``AttributeError``
    raised while reading the metadata (no ``cardData`` attribute, or card
    data without a ``get`` method) counts as "no info".
    """
    try:
        metadata = model.cardData
        if not metadata:
            return False
        declared = metadata.get("base_model")
        # Non-string values (e.g. a list of base models) are rejected.
        return bool(declared) and isinstance(declared, str)
    except AttributeError:
        return False
# Split every model into two buckets keyed by the result of
# ``has_base_model_info``. Both keys are pre-seeded so that ``.get(True)`` and
# ``.get(False)`` yield a (possibly empty) list instead of ``None`` when one
# bucket has no members.
grouped_by_has_base_model_info = {
    True: [],
    False: [],
    **groupby(has_base_model_info, get_all_models()),
}
def produce_summary():
    """Build the three-line coverage summary shown at the top of the page.

    The percentage is computed from the two buckets of
    ``grouped_by_has_base_model_info`` themselves, so the denominator always
    matches the counts being reported and no second model listing is
    triggered. A missing bucket is treated as empty, and an empty listing
    yields ``0.0%`` instead of dividing by zero.

    Returns:
        str: the summary text, lines separated by ``"\\n"``.
    """
    with_info = len(grouped_by_has_base_model_info.get(True) or [])
    without_info = len(grouped_by_has_base_model_info.get(False) or [])
    total = with_info + without_info
    share = round(with_info / total * 100, 2) if total else 0.0
    return (
        f"{with_info:,} models have base model info.\n"
        f"{without_info:,} models don't have base model info.\n"
        f"Currently {share}% of models have base model info."
    )
# Models whose card names a single string base model. Falls back to an empty
# list when the grouping has no ``True`` bucket, so the code below never
# iterates over ``None``.
models_with_base_model_info = grouped_by_has_base_model_info.get(True) or []

# Parallel lists: entry i of each describes models_with_base_model_info[i].
base_models = [
    model.cardData.get("base_model") for model in models_with_base_model_info
]
pipeline_tags = [model.pipeline_tag for model in models_with_base_model_info]

# One row per distinct base model together with how many models cite it.
df = pd.DataFrame({"base_model": base_models}).value_counts().reset_index()
df_with_org = df.copy(deep=True)

# De-duplicated task tags for the filter dropdown. Sorted because set
# iteration order is not stable between runs, which would reshuffle the
# dropdown choices on every restart.
unique_pipeline_tags = sorted({tag for tag in pipeline_tags if tag is not None})
def parse_org(hub_id):
    """Return the organisation that owns ``hub_id``, or ``None`` if unknown.

    * ``"name"`` (no slash) is attributed to ``"huggingface"``.
    * ``"owner/name"`` returns ``"owner"``.
    * Anything that looks like a filesystem path rather than a Hub id
      (an empty, ``"."`` or ``".."`` owner, or more than one slash, e.g.
      ``"./ckpt/model"``) returns ``None`` so callers can drop it instead of
      crediting it to an organisation.

    Args:
        hub_id (str): the base-model identifier taken from a model card.

    Returns:
        str | None: organisation name, or ``None`` when it cannot be derived.
    """
    parts = hub_id.split("/")
    if len(parts) == 1:
        return "huggingface"
    if len(parts) == 2 and parts[0] not in {"", ".", ".."}:
        return parts[0]
    return None
# Tag each base model with its owning organisation and discard the rows for
# which parse_org produced no organisation.
df_with_org = df_with_org.assign(
    org=df_with_org["base_model"].apply(parse_org)
).dropna(subset=["org"])


def _declared_base_model(model):
    """Return the ``base_model`` entry of a model's card data."""
    return model.cardData.get("base_model")


# base model id -> list of models that declare it as their base model.
grouped_by_base_model = groupby(_declared_base_model, models_with_base_model_info)

# Choices for the base-model dropdown, in the row order of ``df``.
all_base_models = df["base_model"].tolist()
def return_models_for_base_model(base_model):
    """Render a Markdown report of the models fine-tuned from ``base_model``.

    The report has a heading linking to the base model, the number of
    children, their combined download count, and one bullet per child
    ordered by downloads (highest first).

    Args:
        base_model (str | None): base-model id to look up in
            ``grouped_by_base_model``.

    Returns:
        str: Markdown text. Empty when no base model is given; a report with
        zero children when the id is not present in the grouping.
    """
    # Nothing selected (None or an empty string, as a cleared dropdown may
    # send): render nothing instead of a report about "None".
    if not base_model:
        return ""

    # An unknown id yields an empty child list rather than sorted(None).
    children = sorted(
        grouped_by_base_model.get(base_model) or [],
        key=lambda model: model.downloads,
        reverse=True,
    )
    total_download_number = sum(model.downloads for model in children)

    sections = [
        "## Models fine-tuned from"
        f" [`{base_model}`](https://huggingface.co/{base_model}) \n\n",
        f"`{base_model}` has {len(children)} children\n\n",
        f"`{base_model}`'s children have been"
        f" downloaded {total_download_number:,} times\n\n",
        "### Children models \n\n",
    ]
    sections.extend(
        f"- [{model.modelId}](https://huggingface.co/{model.modelId})"
        f" | number of downloads {model.downloads}\n\n"
        for model in children
    )
    # Single join instead of repeated string concatenation in the loop.
    return "".join(sections)
def return_base_model_popularity(pipeline=None):
    """Rank base models by how many models are fine-tuned from them.

    Counting is done per base model *after* the optional task filter is
    applied. Without a filter each base model therefore appears exactly once
    with its overall total, and models lacking a pipeline tag are still
    counted. With a filter only models carrying that tag are counted.

    Args:
        pipeline (str | None): task/pipeline tag to restrict the ranking to;
            ``None`` ranks across all tasks.

    Returns:
        pandas.DataFrame: at most 50 rows with columns ``base_model`` and
        ``count``, most popular first.
    """
    frame = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    if pipeline is not None:
        frame = frame[frame["pipeline"] == pipeline]
    return (
        frame["base_model"]
        .value_counts()
        # Name both output columns explicitly instead of relying on the
        # names value_counts happens to assign.
        .rename_axis("base_model")
        .reset_index(name="count")
        .head(50)
    )
def return_base_model_popularity_by_org(pipeline=None):
    """Rank organisations by how many models are fine-tuned from their base models.

    Every model with base-model info contributes one count to the
    organisation that ``parse_org`` derives from its base model; models for
    which ``parse_org`` returns ``None`` are ignored.

    Args:
        pipeline (str | None): task/pipeline tag to restrict the ranking to;
            ``None`` ranks across all tasks.

    Returns:
        pandas.DataFrame: at most 50 rows with columns ``org`` and ``count``,
        largest count first.
    """
    frame = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    if pipeline is not None:
        frame = frame[frame["pipeline"] == pipeline]

    orgs = frame["base_model"].apply(parse_org).dropna()
    return (
        # Counting rows per org equals counting (base_model, org) pairs and
        # summing those counts per org, without the intermediate frame.
        orgs.value_counts()
        .rename_axis("org")
        .reset_index(name="count")
        .head(50)
    )
# Static copy rendered at the top of the page.
_PAGE_TITLE = "# Base model explorer: explore the lineage of models on the 🤗 Hub"
_PAGE_INTRO = """When sharing models to the Hub it is possible to specify a base model in the model card, i.e. that your model is a fine-tuned version of [bert-base-cased](https://huggingface.co/bert-base-cased).
This Space allows you to find children models for a given base model and view the popularity of models for fine-tuning.
You can also optionally filter by task to see rankings for a particular machine learning task.
Don't forget to ❤ if you like this space 🤗"""

with gr.Blocks() as demo:
    # Header: title, introduction and the coverage summary.
    gr.Markdown(_PAGE_TITLE)
    gr.Markdown(_PAGE_INTRO)
    gr.Markdown(produce_summary())

    # Lookup section: pick a base model, show its children as Markdown.
    gr.Markdown("## Find all models trained from a base model")
    base_model_picker = gr.Dropdown(all_base_models, label="Base Model")
    children_report = gr.Markdown()
    base_model_picker.change(
        fn=return_models_for_base_model,
        inputs=base_model_picker,
        outputs=children_report,
    )

    # Ranking section: one task filter drives both ranking tables.
    gr.Markdown("## Base model rankings ")
    task_filter = gr.Dropdown(
        choices=unique_pipeline_tags,
        value=None,
        label="Filter rankings by task pipeline",
    )

    with gr.Accordion("Base model popularity ranking", open=False):
        popularity_table = gr.DataFrame(return_base_model_popularity(None))
    task_filter.change(
        fn=return_base_model_popularity,
        inputs=task_filter,
        outputs=popularity_table,
    )

    with gr.Accordion("Base model popularity ranking by organization", open=False):
        org_popularity_table = gr.DataFrame(return_base_model_popularity_by_org(None))
    task_filter.change(
        fn=return_base_model_popularity_by_org,
        inputs=task_filter,
        outputs=org_popularity_table,
    )

# Start the app once the layout and event wiring are complete.
demo.launch()
|