"""Streamlit app showing weekly download statistics per Transformers architecture.

On the first run of each ISO week the statistics are fetched from the Hub,
written to ``data/data_<week>_<year>.csv`` inside a clone of the dataset
repository and pushed back. Later runs in the same week only read that file.
"""

import datetime
import os

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi
from huggingface_hub import Repository
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES

today = datetime.date.today()
year, week, _ = today.isocalendar()

DATASET_REPO_URL = "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"
DATA_FILENAME = f"data_{week}_{year}.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)


def retrieve_model_stats():
    """Collect download statistics for every architecture in CONFIG_MAPPING_NAMES.

    For each architecture name the Hub is queried with ``filter=<name>`` and
    the number of matching models, their summed downloads and the average
    downloads per model are recorded. Afterwards each architecture's share of
    all downloads is computed and the rows are ordered by that share,
    largest first.

    Returns:
        str: CSV text with the columns ``model_names``, ``num_downloads``
        (formatted with thousands separators), ``%_of_all_downloads``,
        ``num_models`` and ``download_per_model``.
    """
    hf_api = HfApi()
    all_stats = {}
    total_downloads = 0

    for model_name in CONFIG_MAPPING_NAMES:
        # Materialise the result: newer huggingface_hub releases return a lazy
        # iterable from list_models(), which supports neither len() nor a
        # second pass.
        models = list(hf_api.list_models(filter=model_name))
        num_models = len(models)
        # `downloads` may be absent or None on a model entry; count it as 0.
        num_downloads = sum(getattr(m, "downloads", None) or 0 for m in models)

        # Key order defines the CSV column order, so keep it stable.
        all_stats[model_name] = {
            "num_downloads": num_downloads,
            "%_of_all_downloads": 0,
            "num_models": num_models,
            "download_per_model": round(num_downloads / num_models, 2) if num_models else 0,
        }
        total_downloads += num_downloads

    for stats in all_stats.values():
        downloads = stats["num_downloads"]
        # Guard against a Hub response that reports no downloads at all.
        share = downloads / total_downloads if total_downloads else 0.0
        stats["%_of_all_downloads"] = round(share, 5) * 100
        # Stored as a human-readable string, e.g. "1,234,567".
        stats["num_downloads"] = f"{downloads:,}"

    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the relative order of equal shares exactly as before.
    ranked = sorted(all_stats.items(), key=lambda item: item[1]["%_of_all_downloads"])
    ranked.reverse()

    dataframe = pd.DataFrame.from_dict(dict(ranked), orient="index")
    # Name the index column explicitly instead of prepending text to the CSV.
    return dataframe.to_csv(index_label="model_names")


def _downloads_bar_chart(downloads, model_names):
    """Build a bar chart of total downloads per architecture, in the given order."""
    source = pd.DataFrame({
        "Number of total downloads": downloads,
        "Model architecture name": model_names,
    })
    return alt.Chart(source).mark_bar().encode(
        y="Number of total downloads",
        # sort=None keeps the row order of `source` instead of sorting the axis.
        x=alt.X("Model architecture name", sort=None),
    )


repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL)

if not os.path.isfile(DATA_FILE):
    print("Create datafile...")
    result = retrieve_model_stats()

    # Checked again after the retrieval; presumably this guards against
    # another run having created the file in the meantime - TODO confirm.
    if not os.path.isfile(DATA_FILE):
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            f.write(result)

        commit_url = repo.push_to_hub()
        print(commit_url)

# read_csv opens the path itself; no separate file handle is needed.
dataframe = pd.read_csv(DATA_FILE)

# str() covers the case where pandas parsed the column as integers because no
# value contained a thousands separator.
int_downloads = np.array([int(str(x).replace(",", "")) for x in dataframe["num_downloads"].values])
model_names = dataframe["model_names"].values

st.title(f"Transformers stats for year {year} and week {week}")

# print top 20 downloads
bar_chart = _downloads_bar_chart(int_downloads[:20], model_names[:20])
st.title("Top 20 downloads last 30 days")
st.altair_chart(bar_chart, use_container_width=True)

# print bottom 20 downloads
bar_chart = _downloads_bar_chart(int_downloads[-20:], model_names[-20:])
st.title("Bottom 20 downloads last 30 days")
st.altair_chart(bar_chart, use_container_width=True)

# print all stats
st.title("All stats last 30 days")
st.table(dataframe)