|
from huggingface_hub import HfApi |
|
import pandas as pd |
|
import os |
|
import streamlit as st |
|
import altair as alt |
|
import numpy as np |
|
import datetime |
|
from huggingface_hub import Repository |
|
|
|
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES |
|
from transformers.models.auto.modeling_auto import ( |
|
MODEL_FOR_CTC_MAPPING_NAMES, |
|
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, |
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, |
|
MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, |
|
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES, |
|
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, |
|
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, |
|
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES, |
|
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, |
|
) |
|
|
|
# Architecture names taken from the keys of the CTC, audio-classification and
# speech-seq2seq auto-model mappings; used to label an architecture "audio".
audio_models = [
    *MODEL_FOR_CTC_MAPPING_NAMES,
    *MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
    *MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
]

# "clip" is listed by hand; the remaining names are the keys of the
# vision-to-sequence, segmentation, image-classification and image-modeling
# mappings. Used to label an architecture "vision".
vision_models = [
    "clip",
    *MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
    *MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
    *MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
    *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
    *MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES,
    *MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
]
|
|
|
# The ISO year and ISO week number of the current date name the weekly data file.
today = datetime.date.today()
year, week = today.isocalendar()[:2]

# Dataset repository that is cloned into the local "data" directory.
DATASET_REPO_URL = "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"
# One CSV per (week, year) pair, stored inside the local "data" directory.
DATA_FILENAME = "data_{}_{}.csv".format(week, year)
DATA_FILE = os.path.join("data", DATA_FILENAME)

# Access token read from the environment; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Log only whether a token is present, never the token itself.
print("is none?", HF_TOKEN is None)
|
|
|
|
|
def retrieve_model_stats():
    """Collect per-architecture download statistics from the Hugging Face Hub.

    For every architecture name in ``CONFIG_MAPPING_NAMES`` the Hub is queried
    for the models filtered by that name. For each architecture the number of
    models, their summed downloads, the average downloads per model, the share
    of all downloads (in percent) and a modality label ("audio", "vision" or
    "text") are recorded.

    Returns:
        str: CSV text with one row per architecture, ordered by descending
        share of downloads. The first column is headed ``model_names`` and
        ``num_downloads`` is formatted with thousands separators.
    """
    hf_api = HfApi()

    # Build the lookup sets once instead of scanning the lists per architecture.
    audio_names = set(audio_models)
    vision_names = set(vision_models)

    all_stats = {}
    total_downloads = 0

    for model_name in CONFIG_MAPPING_NAMES:
        if model_name in audio_names:
            modality = "audio"
        elif model_name in vision_names:
            modality = "vision"
        else:
            modality = "text"

        # Depending on the huggingface_hub version, list_models may return a
        # lazy iterable; materialize it so it can be counted and iterated.
        models = list(hf_api.list_models(filter=model_name))
        num_models = len(models)

        # The downloads attribute may be absent or None; count those as 0.
        num_downloads = sum(getattr(m, "downloads", None) or 0 for m in models)
        download_per_model = round(num_downloads / num_models, 2) if num_models else 0
        total_downloads += num_downloads

        # Key order determines the column order of the resulting CSV.
        all_stats[model_name] = {
            "num_downloads": num_downloads,
            "%_of_all_downloads": 0,
            "num_models": num_models,
            "download_per_model": download_per_model,
            "modality": modality,
        }

    for stats in all_stats.values():
        downloads = stats["num_downloads"]
        # Guard against ZeroDivisionError when nothing was downloaded at all.
        share = downloads / total_downloads if total_downloads else 0.0
        stats["%_of_all_downloads"] = round(share, 5) * 100
        stats["num_downloads"] = f"{downloads:,}"

    ascending = sorted(
        all_stats.items(), key=lambda item: item[1]["%_of_all_downloads"]
    )
    # reversed() rather than reverse=True: keeps the row order among ties
    # exactly as an ascending stable sort read back to front.
    sorted_results = dict(reversed(ascending))
    dataframe = pd.DataFrame.from_dict(sorted_results, orient="index")

    # index_label names the index column directly in the CSV header.
    return dataframe.to_csv(index_label="model_names")
|
|
|
|
|
# Clone the dataset repository into the local "data" directory.
repo = Repository(
    local_dir="data",
    clone_from=DATASET_REPO_URL,
    use_auth_token=HF_TOKEN,
)

# No CSV for the current week yet: generate it during this run.
if not os.path.isfile(DATA_FILE):
    st.title("You are the first this week!!! Please wait until the new data is generated and written")
    result = retrieve_model_stats()

    # Check again after the statistics were retrieved and only write (and
    # push) when the file is still missing.
    if not os.path.isfile(DATA_FILE):
        with open(DATA_FILE, "w") as csv_file:
            csv_file.write(result)

        commit_url = repo.push_to_hub()
        print(commit_url)
|
|
|
# Load this week's statistics. pd.read_csv is given the path and opens the
# file itself, so no separate file handle is needed.
dataframe = pd.read_csv(DATA_FILE)

# NOTE(review): bare expression whose value is not used by the script. With
# Streamlit "magic" enabled it is presumably rendered in the app, so it is
# kept as is; confirm whether that output is intended.
dataframe[dataframe["modality"] == "audio"]

# Download counts are stored with thousands separators (e.g. "1,234").
# str() keeps the parsing working if pandas inferred the column as integers,
# which may happen when no value in the file contains a comma.
int_downloads = np.array(
    [int(str(x).replace(",", "")) for x in dataframe["num_downloads"].values]
)
|
|
|
def _parse_downloads(values):
    """Convert download counts such as ``"1,234"`` into a NumPy array of ints.

    ``str()`` keeps this working when the values are already numeric, which
    may happen if pandas inferred the column as integers.
    """
    return np.array([int(str(value).replace(",", "")) for value in values])


def _show_downloads_chart(title, downloads, names):
    """Render a titled bar chart of download counts per architecture name.

    Args:
        title: Heading shown above the chart.
        downloads: Sequence of download counts, one per bar.
        names: Sequence of architecture names, aligned with ``downloads``.

    The x axis uses ``sort=None``, so bars are expected to follow the row
    order of the inputs.
    """
    source = pd.DataFrame(
        {
            "Number of total downloads": downloads,
            "Model architecture name": names,
        }
    )
    bar_chart = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            y="Number of total downloads",
            x=alt.X("Model architecture name", sort=None),
        )
    )
    st.title(title)
    st.altair_chart(bar_chart, use_container_width=True)


st.title(f"Stats for year {year} and week {week}")

# First and last 20 rows of the table, in file order.
all_model_names = dataframe["model_names"].values
_show_downloads_chart(
    "Top 20 downloads last 30 days",
    int_downloads[:20],
    all_model_names[:20],
)
_show_downloads_chart(
    "Bottom 20 downloads last 30 days",
    int_downloads[-20:],
    all_model_names[-20:],
)

# Per-modality charts.
df_vision = dataframe[dataframe["modality"] == "vision"]
vision_int_downloads = _parse_downloads(df_vision["num_downloads"].values)
_show_downloads_chart(
    "Vision downloads last 30 days",
    vision_int_downloads,
    df_vision["model_names"].values,
)

df_audio = dataframe[dataframe["modality"] == "audio"]
audio_int_downloads = _parse_downloads(df_audio["num_downloads"].values)
_show_downloads_chart(
    "Audio downloads last 30 days",
    audio_int_downloads,
    df_audio["model_names"].values,
)

# Raw tables: everything first, then the per-modality subsets.
st.title("All stats last 30 days")
st.table(dataframe)

st.title("Vision stats last 30 days")
st.table(df_vision)

st.title("Audio stats last 30 days")
st.table(df_audio)
|
|