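"""Streamlit app that tallies per-architecture download counts from the
Hugging Face Hub, archives the numbers as a weekly CSV in a dataset repo,
and renders top/bottom-20 bar charts plus a full stats table.

To preview locally (assuming this file is saved as app.py):
    streamlit run app.py
"""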
import datetime
import os

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, Repository

from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CTC_MAPPING_NAMES,
    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
    MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
    MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
    MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES,
    MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
)

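# Architectures appearing in these auto-model mappings are bucketed as
# "audio" or "vision"; everything else is treated as "text" below.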
audio_models = (
    set(MODEL_FOR_CTC_MAPPING_NAMES)
    | set(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)
    | set(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES)
)

vision_models = (
    set(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
    | set(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES)
    | set(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
    | set(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES)
    | set(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES)
    | set(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES)
    | set(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES)
)

today = datetime.date.today()
year, week, _ = today.isocalendar()

DATASET_REPO_URL = (
    "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"
)
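# one stats file per ISO calendar week, e.g. data/data_37_2022.csv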
DATA_FILENAME = f"data_{week}_{year}.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)


def retrieve_model_stats():
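    """Collect download stats for every architecture in CONFIG_MAPPING_NAMES.

    Returns a CSV string with one row per architecture: download count,
    share of all downloads, number of models on the Hub, downloads per
    model, and modality.
    """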
    hf_api = HfApi()
    all_stats = {}
    total_downloads = 0

    for model_name in list(CONFIG_MAPPING_NAMES.keys()):
        if model_name in audio_models:
            modality = "audio"
        elif model_name in vision_models:
            modality = "vision"
        else:
            modality = "text"

        model_stats = {
            "num_downloads": 0,
            "%_of_all_downloads": 0,
            "num_models": 0,
            "download_per_model": 0,
            "modality": modality,
        }
        # list() guards against newer huggingface_hub versions where
        # list_models returns an iterator rather than a list
        models = list(hf_api.list_models(filter=model_name))

        model_stats["num_models"] = len(models)
        model_stats["num_downloads"] = sum(
            [m.downloads for m in models if hasattr(m, "downloads")]
        )
        if len(models) > 0:
            model_stats["download_per_model"] = round(
                model_stats["num_downloads"] / len(models), 2
            )
        total_downloads += model_stats["num_downloads"]

        # save in overall dict
        all_stats[model_name] = model_stats

    for model_name in list(CONFIG_MAPPING_NAMES.keys()):
        all_stats[model_name]["%_of_all_downloads"] = round(
            all_stats[model_name]["num_downloads"] / total_downloads * 100, 3
        )
        downloads = all_stats[model_name]["num_downloads"]
        all_stats[model_name]["num_downloads"] = f"{downloads:,}"

    sorted_results = dict(
        sorted(
            all_stats.items(),
            key=lambda d: d[1]["%_of_all_downloads"],
            reverse=True,
        )
    )
    dataframe = pd.DataFrame.from_dict(sorted_results, orient="index")

    # to_csv() leaves the index column unnamed, so prepend a header
    # for the model-name index
    result = "model_names" + dataframe.to_csv()
    return result


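# Clone the dataset repo into ./data; `repo.push_to_hub()` later commits
# and pushes any newly written weekly CSV.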
repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL)

if not os.path.isfile(DATA_FILE):
    print("Creating data file...")
    result = retrieve_model_stats()

    with open(DATA_FILE, "w") as f:
        f.write(result)

    commit_url = repo.push_to_hub()
    print(commit_url)

dataframe = pd.read_csv(DATA_FILE)

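# "num_downloads" was serialized with thousands separators (e.g. "1,234"),
# so strip the commas before converting back to integers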
int_downloads = np.array(
    [int(x.replace(",", "")) for x in dataframe["num_downloads"].values]
)

st.title(f"Transformers stats for year {year} and week {week}")
# print top 20 downloads
source = pd.DataFrame(
    {
        "Number of total downloads": int_downloads[:20],
        "Model architecture name": dataframe["model_names"].values[:20],
    }
)
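# sort=None keeps the x-axis in dataframe order (already sorted by download
# share) rather than Altair's default alphabetical order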
bar_chart = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        y="Number of total downloads",
        x=alt.X("Model architecture name", sort=None),
    )
)
st.title("Top 20 downloads last 30 days")
st.altair_chart(bar_chart, use_container_width=True)

# print bottom 20 downloads
source = pd.DataFrame(
    {
        "Number of total downloads": int_downloads[-20:],
        "Model architecture name": dataframe["model_names"].values[-20:],
    }
)
bar_chart = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        y="Number of total downloads",
        x=alt.X("Model architecture name", sort=None),
    )
)
st.title("Bottom 20 downloads last 30 days")
st.altair_chart(bar_chart, use_container_width=True)

# print all stats
st.title("All stats last 30 days")
st.table(dataframe)