osanseviero
committed on
Commit
•
78f7e42
1
Parent(s):
6c21ae3
Release v2
Browse files- __pycache__/language.cpython-38.pyc +0 -0
- __pycache__/pipelines.cpython-38.pyc +0 -0
- __pycache__/utils.cpython-38.pyc +0 -0
- changelog.md +16 -1
- language.py +52 -0
- models.py +227 -214
- pipelines.py +45 -0
- utils.py +4 -1
__pycache__/language.cpython-38.pyc
ADDED
Binary file (1.6 kB). View file
|
|
__pycache__/pipelines.cpython-38.pyc
ADDED
Binary file (1.5 kB). View file
|
|
__pycache__/utils.cpython-38.pyc
ADDED
Binary file (2.59 kB). View file
|
|
changelog.md
CHANGED
@@ -1,11 +1,26 @@
|
|
1 |
Changelog
|
2 |
|
|
|
|
|
|
|
3 |
v0.2 - Oct 24
|
4 |
- Languages
|
5 |
- Allow filtering for modality
|
6 |
-
- Show new languages for the diff
|
7 |
- Show rate of change in languages
|
8 |
- Also include multilingual tag as multilingual for model selection in languages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
v0.1
|
11 |
- Allow pick comparison version
|
|
|
1 |
Changelog
|
2 |
|
3 |
+
Planned
|
4 |
+
- Allow filtering just for the new models (no way to get this atm)
|
5 |
+
|
6 |
v0.2 - Oct 24
|
7 |
- Languages
|
8 |
- Allow filtering for modality
|
9 |
+
- Show new and removed languages for the diff
|
10 |
- Show rate of change in languages
|
11 |
- Also include multilingual tag as multilingual for model selection in languages
|
12 |
+
- Spotted bug: `False` shows up as a row in the dataset; needs investigation
|
13 |
+
- License
|
14 |
+
- Add rate of change for top metrics
|
15 |
+
- Show lost and new licenses
|
16 |
+
- Pipelines
|
17 |
+
- Add rate of change for all metrics
|
18 |
+
- Fix bug that did not show new tags
|
19 |
+
- Add info per modality
|
20 |
+
- See new tags
|
21 |
+
- Pipeline breakdown by modality
|
22 |
+
- Discussions and Libraries
|
23 |
+
- Add rate of change for metrics
|
24 |
|
25 |
v0.1
|
26 |
- Allow pick comparison version
|
language.py
CHANGED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ast import literal_eval
|
2 |
+
|
3 |
+
def make_lang_list(row):
    """Return the languages of a model row as a container of tags.

    Args:
        row: Mapping-like object (dict or DataFrame row) whose
            ``"languages"`` entry is either the placeholder ``"none"`` or
            the string form of a Python literal.

    Returns:
        ``[]`` for the ``"none"`` placeholder. Otherwise the parsed literal
        when it is a container (list, tuple, set, dict), a one-element list
        when it is a single string tag, and ``[]`` for any other scalar
        (``None``, booleans, numbers).

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            the cell is not a valid Python literal.
    """
    languages = row["languages"]
    if languages == "none":
        return []

    parsed = literal_eval(languages)
    if isinstance(parsed, (list, tuple, set, dict)):
        return parsed
    if isinstance(parsed, str):
        # A lone tag must count as one language; returning the bare string
        # would make len() count characters and `in` do substring matching.
        return [parsed]
    # Remaining scalars (None, True/False, numbers) carry no language tag and
    # would make len() fail downstream.
    return []
|
8 |
+
|
9 |
+
def language_count(row):
    """Return the number of entries held in the row's ``"languages"`` value."""
    languages = row["languages"]
    return len(languages)
|
11 |
+
|
12 |
+
def process_for_lang(data, modality):
    """Prepare a model DataFrame for the language statistics.

    Args:
        data: DataFrame with a ``languages`` column, plus a ``modality``
            column when a modality filter is requested. Each ``languages``
            cell is parsed by ``make_lang_list``.
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` keeps only the
            rows of that modality; any other value keeps every row.

    Returns:
        A tuple ``(data, no_lang_count, total_langs, unique_langs)``:
        ``data`` is a processed copy with ``languages`` parsed and the
        ``language_count`` and ``multilingual`` (0/1) columns added;
        ``no_lang_count`` is the number of rows whose languages were
        missing; ``total_langs`` is the number of distinct languages;
        ``unique_langs`` is the array of those distinct languages.
    """
    # Filter by modality (UI label -> value stored in the "modality" column).
    modality_values = {"NLP": "nlp", "Audio": "audio", "Multimodal": "multimodal"}
    if modality in modality_values:
        data = data[data["modality"] == modality_values[modality]]

    # Work on a private copy: the assignments below must neither leak into
    # the caller's frame nor write into a filtered slice of it.
    data = data.copy()

    # Remove rows without languages.
    data.loc[data.languages == "False", 'languages'] = None
    # NOTE(review): this compares cells to an empty dict object; cells that
    # hold the string "{}" are not matched here -- confirm which form occurs.
    data.loc[data.languages == {}, 'languages'] = None

    # Count of rows that have no languages.
    no_lang_count = data["languages"].isna().sum()

    # As the languages column might have multiple languages, we need to
    # convert it to a list. We then count the number of languages.
    # Series.map is used instead of DataFrame.apply(axis=1) because the latter
    # returns a DataFrame for an empty frame, which breaks the assignment when
    # a modality filter matches no rows.
    data["languages"] = data["languages"].fillna('none')
    data["languages"] = data["languages"].map(
        lambda raw: make_lang_list({"languages": raw})
    )
    data["language_count"] = data["languages"].map(len)

    # Just keep the models with at least one language.
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    # Flag repos that carry the explicit "multilingual" tag.
    data['multilingual'] = data["languages"].map(
        lambda tags: int("multilingual" in tags)
    )

    return data, no_lang_count, total_langs, unique_langs
|
43 |
+
|
44 |
+
def filter_multilinguality(data, linguality):
    """Select rows according to the chosen multilinguality option.

    Args:
        data: DataFrame with ``multilingual`` and ``language_count`` columns.
        linguality: ``"Just Multilingual"`` keeps rows flagged multilingual
            or listing more than one language; ``"Three or more languages"``
            keeps rows listing at least three; any other value returns
            ``data`` unchanged.

    Returns:
        The filtered DataFrame, or ``data`` itself when no filter applies.
    """
    if linguality == "Three or more languages":
        at_least_three = data["language_count"] >= 3
        return data[at_least_three]

    if linguality != "Just Multilingual":
        return data

    # Either the explicit tag or more than one listed language qualifies.
    has_multilingual_tag = data["multilingual"] == 1
    has_several_languages = data["language_count"] > 1
    return data[has_multilingual_tag | has_several_languages]
|
models.py
CHANGED
@@ -4,7 +4,9 @@ from ast import literal_eval
|
|
4 |
import altair as alt
|
5 |
import matplotlib.pyplot as plt
|
6 |
|
7 |
-
from utils import process_dataset, eval_tags
|
|
|
|
|
8 |
|
9 |
def main():
|
10 |
# Pick revision at top
|
@@ -26,16 +28,6 @@ def main():
|
|
26 |
supported_revisions,
|
27 |
index=2)
|
28 |
|
29 |
-
def change_pct(old, new):
|
30 |
-
return round(100* (new - old) / new, 3)
|
31 |
-
|
32 |
-
def change_and_delta(old_old, old, new):
|
33 |
-
curr_change = change_pct(old, new)
|
34 |
-
prev_change = change_pct(old_old, old)
|
35 |
-
delta = f"{curr_change-prev_change}%"
|
36 |
-
curr_change = f"{curr_change}%"
|
37 |
-
return curr_change, delta
|
38 |
-
|
39 |
# Process dataset
|
40 |
old_old_data = process_dataset(base_old)
|
41 |
old_data = process_dataset(base)
|
@@ -63,44 +55,11 @@ def main():
|
|
63 |
|
64 |
tab = st.selectbox(
|
65 |
'Topic of interest',
|
66 |
-
["Language",
|
67 |
|
68 |
if tab == "Language":
|
69 |
st.header("Languages info")
|
70 |
|
71 |
-
def make_list(row):
|
72 |
-
languages = row["languages"]
|
73 |
-
if languages == "none":
|
74 |
-
return []
|
75 |
-
return literal_eval(languages)
|
76 |
-
|
77 |
-
def language_count(row):
|
78 |
-
return len(row["languages"])
|
79 |
-
|
80 |
-
def process_for_lang(data):
|
81 |
-
# Remove rows without languages
|
82 |
-
data.loc[data.languages == "False", 'languages'] = None
|
83 |
-
data.loc[data.languages == {}, 'languages'] = None
|
84 |
-
|
85 |
-
# Count of rows that have no languages
|
86 |
-
no_lang_count = data["languages"].isna().sum()
|
87 |
-
|
88 |
-
# As the languages column might have multiple languages,
|
89 |
-
# we need to convert it to a list. We then count the number of languages.
|
90 |
-
data["languages"] = data["languages"].fillna('none')
|
91 |
-
data["languages"] = data.apply(make_list, axis=1)
|
92 |
-
data["language_count"] = data.apply(language_count, axis=1)
|
93 |
-
|
94 |
-
# Just keep the models with at least one language
|
95 |
-
models_with_langs = data[data["language_count"] > 0]
|
96 |
-
langs = models_with_langs["languages"].explode()
|
97 |
-
langs = langs[langs != {}]
|
98 |
-
total_langs = len(langs.unique())
|
99 |
-
|
100 |
-
data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
|
101 |
-
|
102 |
-
return data, no_lang_count, total_langs, langs.unique()
|
103 |
-
|
104 |
filtered_data = data.copy()
|
105 |
old_filtered_data = old_data.copy()
|
106 |
old_old_filtered_data = old_old_data.copy()
|
@@ -109,30 +68,13 @@ def main():
|
|
109 |
'Modalities',
|
110 |
["All", "NLP", "Audio", "Multimodal"])
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
|
120 |
-
elif modality == "Multimodal":
|
121 |
-
filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
|
122 |
-
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
|
123 |
-
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
|
124 |
-
|
125 |
-
|
126 |
-
filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
|
127 |
-
old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
|
128 |
-
old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
|
129 |
-
|
130 |
-
total_samples_filtered = filtered_data.shape[0]
|
131 |
-
total_samples_old_filtered = old_filtered_data.shape[0]
|
132 |
-
total_samples_old_old_filtered = old_old_filtered_data.shape[0]
|
133 |
-
v = total_samples_filtered-no_lang_count
|
134 |
-
v_old = total_samples_old_filtered-no_lang_count_old
|
135 |
-
v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
|
136 |
|
137 |
col1, col2 = st.columns(2)
|
138 |
with col1:
|
@@ -155,6 +97,7 @@ def main():
|
|
155 |
curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
|
156 |
st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
|
157 |
st.text(f"New languages {set(langs)-set(langs_old)}")
|
|
|
158 |
|
159 |
st.subheader("Count of languages per model repo")
|
160 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
@@ -162,19 +105,8 @@ def main():
|
|
162 |
'All or just Multilingual',
|
163 |
["All", "Just Multilingual", "Three or more languages"])
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
if linguality == "Just Multilingual":
|
168 |
-
multilingual_tag = data["multilingual"] == 1
|
169 |
-
multiple_lang_tags = data["language_count"] > 1
|
170 |
-
return data[multilingual_tag | multiple_lang_tags]
|
171 |
-
elif linguality == "Three or more languages":
|
172 |
-
return data[data["language_count"] >= 3]
|
173 |
-
else:
|
174 |
-
return data
|
175 |
-
|
176 |
-
models_with_langs = filter_multilinguality(filtered_data)
|
177 |
-
models_with_langs_old = filter_multilinguality(old_filtered_data)
|
178 |
|
179 |
df1 = models_with_langs['language_count'].value_counts()
|
180 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
@@ -185,14 +117,6 @@ def main():
|
|
185 |
'All or filtered',
|
186 |
["All", "No English", "Remove top 10"])
|
187 |
|
188 |
-
filter = 0
|
189 |
-
if linguality_2 == "All":
|
190 |
-
filter = 0
|
191 |
-
elif linguality_2 == "No English":
|
192 |
-
filter = 1
|
193 |
-
else:
|
194 |
-
filter = 2
|
195 |
-
|
196 |
models_with_langs = filtered_data[filtered_data["language_count"] > 0]
|
197 |
langs = models_with_langs["languages"].explode()
|
198 |
langs = langs[langs != {}]
|
@@ -204,9 +128,9 @@ def main():
|
|
204 |
langs = langs[langs != {}]
|
205 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
206 |
|
207 |
-
if
|
208 |
d = orig_d.iloc[1:]
|
209 |
-
elif
|
210 |
d = orig_d.iloc[10:]
|
211 |
|
212 |
# Just keep top 25 to avoid vertical scroll
|
@@ -231,31 +155,51 @@ def main():
|
|
231 |
final_data = pd.merge(
|
232 |
d, orig_d_old, how="outer", on="language"
|
233 |
)
|
234 |
-
|
235 |
-
|
236 |
-
final_data["diff"] = final_data["counts"]
|
237 |
-
|
238 |
st.dataframe(final_data)
|
239 |
|
240 |
-
|
241 |
-
|
242 |
#with tab2:
|
243 |
if tab == "License":
|
244 |
st.header("License info")
|
245 |
|
246 |
no_license_count = data["license"].isna().sum()
|
247 |
no_license_count_old = old_data["license"].isna().sum()
|
248 |
-
|
|
|
|
|
|
|
249 |
with col1:
|
250 |
v = total_samples-no_license_count
|
251 |
v_old = total_samples_old-no_license_count_old
|
252 |
st.metric(label="License Specified", value=v, delta=int(v-v_old))
|
253 |
with col2:
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
st.subheader("Distribution of licenses per model repo")
|
261 |
license_filter = st.selectbox(
|
@@ -306,81 +250,65 @@ def main():
|
|
306 |
|
307 |
tags_old = old_data["tags"].explode()
|
308 |
tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
|
309 |
-
|
310 |
-
|
311 |
-
unique_tags_old = len(
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
|
313 |
no_pipeline_count = data["pipeline"].isna().sum()
|
314 |
no_pipeline_count_old = old_data["pipeline"].isna().sum()
|
|
|
315 |
|
316 |
-
col1, col2
|
|
|
|
|
|
|
317 |
with col1:
|
318 |
-
v = total_samples-no_pipeline_count
|
319 |
-
v_old = total_samples_old-no_pipeline_count_old
|
320 |
st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
|
321 |
with col2:
|
|
|
|
|
|
|
|
|
|
|
322 |
st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
|
323 |
-
with
|
|
|
|
|
|
|
|
|
|
|
324 |
st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
|
|
|
|
|
|
|
325 |
|
326 |
-
|
327 |
'Modalities',
|
328 |
["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
|
329 |
|
330 |
-
filter = 0
|
331 |
-
if pipeline_filter == "All":
|
332 |
-
filter = 0
|
333 |
-
elif pipeline_filter == "NLP":
|
334 |
-
filter = 1
|
335 |
-
elif pipeline_filter == "CV":
|
336 |
-
filter = 2
|
337 |
-
elif pipeline_filter == "Audio":
|
338 |
-
filter = 3
|
339 |
-
elif pipeline_filter == "RL":
|
340 |
-
filter = 4
|
341 |
-
elif pipeline_filter == "Multimodal":
|
342 |
-
filter = 5
|
343 |
-
elif pipeline_filter == "Tabular":
|
344 |
-
filter = 6
|
345 |
-
|
346 |
st.subheader("High-level metrics")
|
347 |
-
filtered_data = data[data['pipeline'].notna()]
|
348 |
-
filtered_data_old = old_data[old_data['pipeline'].notna()]
|
349 |
-
|
350 |
-
if filter == 1:
|
351 |
-
filtered_data = data[data["modality"] == "nlp"]
|
352 |
-
filtered_data_old = old_data[old_data["modality"] == "nlp"]
|
353 |
-
elif filter == 2:
|
354 |
-
filtered_data = data[data["modality"] == "cv"]
|
355 |
-
filtered_data_old = old_data[old_data["modality"] == "cv"]
|
356 |
-
elif filter == 3:
|
357 |
-
filtered_data = data[data["modality"] == "audio"]
|
358 |
-
filtered_data_old = old_data[old_data["modality"] == "audio"]
|
359 |
-
elif filter == 4:
|
360 |
-
filtered_data = data[data["modality"] == "rl"]
|
361 |
-
filtered_data_old = old_data[old_data["modality"] == "rl"]
|
362 |
-
elif filter == 5:
|
363 |
-
filtered_data = data[data["modality"] == "multimodal"]
|
364 |
-
filtered_data_old = old_data[old_data["modality"] == "multimodal"]
|
365 |
-
elif filter == 6:
|
366 |
-
filtered_data = data[data["modality"] == "tabular"]
|
367 |
-
filtered_data_old = old_data[old_data["modality"] == "tabular"]
|
368 |
|
369 |
col1, col2, col3 = st.columns(3)
|
370 |
with col1:
|
371 |
p = st.selectbox(
|
372 |
'What pipeline do you want to see?',
|
373 |
-
["all", *
|
374 |
)
|
375 |
with col2:
|
376 |
l = st.selectbox(
|
377 |
'What library do you want to see?',
|
378 |
-
["all", "not transformers", *
|
379 |
)
|
380 |
with col3:
|
381 |
f = st.selectbox(
|
382 |
-
'What framework support?
|
383 |
-
["all", "
|
384 |
)
|
385 |
|
386 |
col1, col2 = st.columns(2)
|
@@ -393,49 +321,13 @@ def main():
|
|
393 |
o = st.selectbox(
|
394 |
label="Operation (for tags)",
|
395 |
options=["Any", "All", "None"]
|
396 |
-
)
|
397 |
-
|
398 |
-
def filter_fn(row):
|
399 |
-
tags = row["tags"]
|
400 |
-
tags[:] = [d for d in tags if isinstance(d, str)]
|
401 |
-
if o == "All":
|
402 |
-
if all(elem in tags for elem in filt):
|
403 |
-
return True
|
404 |
-
|
405 |
-
s1 = set(tags)
|
406 |
-
s2 = set(filt)
|
407 |
-
if o == "Any":
|
408 |
-
if bool(s1 & s2):
|
409 |
-
return True
|
410 |
-
if o == "None":
|
411 |
-
if len(s1.intersection(s2)) == 0:
|
412 |
-
return True
|
413 |
-
return False
|
414 |
|
|
|
|
|
|
|
|
|
415 |
|
416 |
-
if p != "all":
|
417 |
-
filtered_data = filtered_data[filtered_data["pipeline"] == p]
|
418 |
-
filtered_data_old = filtered_data_old[filtered_data_old["pipeline"] == p]
|
419 |
-
if l != "all" and l != "not transformers":
|
420 |
-
filtered_data = filtered_data[filtered_data["library"] == l]
|
421 |
-
filtered_data_old = filtered_data_old[filtered_data_old["library"] == l]
|
422 |
-
if l == "not transformers":
|
423 |
-
filtered_data = filtered_data[filtered_data["library"] != "transformers"]
|
424 |
-
filtered_data_old = filtered_data_old[filtered_data_old["library"] != "transformers"]
|
425 |
-
if f != "all":
|
426 |
-
if f == "py":
|
427 |
-
filtered_data = filtered_data[filtered_data["pytorch"] == 1]
|
428 |
-
filtered_data_old = filtered_data_old[filtered_data_old["pytorch"] == 1]
|
429 |
-
elif f == "tf":
|
430 |
-
filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
|
431 |
-
filtered_data_old = filtered_data_old[filtered_data_old["tensorflow"] == 1]
|
432 |
-
elif f == "jax":
|
433 |
-
filtered_data = filtered_data[filtered_data["jax"] == 1]
|
434 |
-
filtered_data_old = filtered_data_old[filtered_data_old["jax"] == 1]
|
435 |
-
if filt != []:
|
436 |
-
filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
|
437 |
-
filtered_data_old = filtered_data_old[filtered_data_old.apply(filter_fn, axis=1)]
|
438 |
-
|
439 |
|
440 |
d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
|
441 |
columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
|
@@ -443,23 +335,45 @@ def main():
|
|
443 |
final_data = pd.merge(
|
444 |
d, grouped_data, how="outer", on="pipeline"
|
445 |
)
|
446 |
-
sums = grouped_data.sum()
|
447 |
|
448 |
d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
|
449 |
grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
|
450 |
final_data_old = pd.merge(
|
451 |
d_old, grouped_data_old, how="outer", on="pipeline"
|
452 |
)
|
|
|
|
|
|
|
|
|
453 |
sums = grouped_data.sum()
|
454 |
sums_old = grouped_data_old.sum()
|
|
|
455 |
|
456 |
-
col1, col2, col3 = st.columns(
|
|
|
|
|
|
|
457 |
with col1:
|
458 |
-
st.metric(label="Total models", value=
|
459 |
with col2:
|
460 |
-
|
|
|
461 |
with col3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
|
|
|
|
|
|
|
|
|
463 |
|
464 |
col1, col2, col3 = st.columns(3)
|
465 |
with col1:
|
@@ -469,9 +383,41 @@ def main():
|
|
469 |
with col3:
|
470 |
st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
|
471 |
|
472 |
-
|
|
|
|
|
|
|
|
|
473 |
|
474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
|
476 |
st.subheader("Count of models per pipeline")
|
477 |
st.write(alt.Chart(d).mark_bar().encode(
|
@@ -511,8 +457,6 @@ def main():
|
|
511 |
"downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
|
512 |
raw_data = filtered_data[columns_of_interest]
|
513 |
st.dataframe(raw_data)
|
514 |
-
|
515 |
-
|
516 |
|
517 |
# todo : add activity metric
|
518 |
|
@@ -524,6 +468,7 @@ def main():
|
|
524 |
columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
|
525 |
sums = data[columns_of_interest].sum()
|
526 |
sums_old = old_data[columns_of_interest].sum()
|
|
|
527 |
|
528 |
col1, col2, col3, col4 = st.columns(4)
|
529 |
with col1:
|
@@ -535,6 +480,20 @@ def main():
|
|
535 |
with col4:
|
536 |
st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
|
537 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
col1, col2, col3 = st.columns(3)
|
539 |
with col1:
|
540 |
st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
|
@@ -543,6 +502,17 @@ def main():
|
|
543 |
with col3:
|
544 |
st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
|
545 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
546 |
filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
|
547 |
st.dataframe(filtered_data)
|
548 |
|
@@ -552,6 +522,7 @@ def main():
|
|
552 |
|
553 |
no_library_count = data["library"].isna().sum()
|
554 |
no_library_count_old = old_data["library"].isna().sum()
|
|
|
555 |
col1, col2, col3 = st.columns(3)
|
556 |
with col1:
|
557 |
v = total_samples-no_library_count
|
@@ -564,6 +535,22 @@ def main():
|
|
564 |
v_old = len(old_data["library"].unique())
|
565 |
st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
|
566 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
567 |
|
568 |
st.subheader("High-level metrics")
|
569 |
filtered_data = data[data['library'].notna()]
|
@@ -623,8 +610,6 @@ def main():
|
|
623 |
y=alt.X('library', sort=None)
|
624 |
))
|
625 |
|
626 |
-
|
627 |
-
|
628 |
st.subheader("Aggregated Data")
|
629 |
final_data = pd.merge(
|
630 |
final_data, final_data_old, how="outer", on="library"
|
@@ -647,6 +632,7 @@ def main():
|
|
647 |
columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
|
648 |
rows = data.shape[0]
|
649 |
rows_old = old_data.shape[0]
|
|
|
650 |
|
651 |
cond = data["has_model_index"] | data["has_text"]
|
652 |
with_model_card = data[cond]
|
@@ -656,31 +642,58 @@ def main():
|
|
656 |
with_model_card_old = old_data[cond]
|
657 |
c_model_card_old = with_model_card_old.shape[0]
|
658 |
|
|
|
|
|
|
|
|
|
659 |
st.subheader("High-level metrics")
|
660 |
-
col1, col2, col3 = st.columns(
|
661 |
with col1:
|
662 |
-
st.metric(label="#
|
663 |
with col2:
|
664 |
-
|
665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
666 |
with_index = data["has_model_index"].sum()
|
667 |
with_index_old = old_data["has_model_index"].sum()
|
|
|
668 |
with col1:
|
669 |
-
st.metric(label="#
|
670 |
with col2:
|
671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
672 |
|
673 |
with_text = data["has_text"]
|
674 |
with_text_old = old_data["has_text"]
|
|
|
|
|
|
|
|
|
|
|
675 |
with col1:
|
676 |
-
st.metric(label="#
|
677 |
with col2:
|
678 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
679 |
|
680 |
-
|
681 |
st.subheader("Length (chars) of model card content")
|
682 |
-
fig,
|
683 |
-
|
684 |
st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
|
685 |
st.pyplot(fig)
|
686 |
|
|
|
4 |
import altair as alt
|
5 |
import matplotlib.pyplot as plt
|
6 |
|
7 |
+
from utils import process_dataset, eval_tags, change_and_delta
|
8 |
+
from language import process_for_lang, filter_multilinguality
|
9 |
+
from pipelines import filter_pipeline_data
|
10 |
|
11 |
def main():
|
12 |
# Pick revision at top
|
|
|
28 |
supported_revisions,
|
29 |
index=2)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
# Process dataset
|
32 |
old_old_data = process_dataset(base_old)
|
33 |
old_data = process_dataset(base)
|
|
|
55 |
|
56 |
tab = st.selectbox(
|
57 |
'Topic of interest',
|
58 |
+
["Language","License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
59 |
|
60 |
if tab == "Language":
|
61 |
st.header("Languages info")
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
filtered_data = data.copy()
|
64 |
old_filtered_data = old_data.copy()
|
65 |
old_old_filtered_data = old_old_data.copy()
|
|
|
68 |
'Modalities',
|
69 |
["All", "NLP", "Audio", "Multimodal"])
|
70 |
|
71 |
+
filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data, modality)
|
72 |
+
old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data, modality)
|
73 |
+
old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data, modality)
|
74 |
+
|
75 |
+
v = filtered_data.shape[0]-no_lang_count
|
76 |
+
v_old = old_filtered_data.shape[0]-no_lang_count_old
|
77 |
+
v_old_old = old_old_filtered_data.shape[0]-no_lang_count_old_old
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
col1, col2 = st.columns(2)
|
80 |
with col1:
|
|
|
97 |
curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
|
98 |
st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
|
99 |
st.text(f"New languages {set(langs)-set(langs_old)}")
|
100 |
+
st.text(f"Lost languages {set(langs_old)-set(langs)}")
|
101 |
|
102 |
st.subheader("Count of languages per model repo")
|
103 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
|
105 |
'All or just Multilingual',
|
106 |
["All", "Just Multilingual", "Three or more languages"])
|
107 |
|
108 |
+
models_with_langs = filter_multilinguality(filtered_data, linguality)
|
109 |
+
models_with_langs_old = filter_multilinguality(old_filtered_data, linguality)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
df1 = models_with_langs['language_count'].value_counts()
|
112 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
|
|
117 |
'All or filtered',
|
118 |
["All", "No English", "Remove top 10"])
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
models_with_langs = filtered_data[filtered_data["language_count"] > 0]
|
121 |
langs = models_with_langs["languages"].explode()
|
122 |
langs = langs[langs != {}]
|
|
|
128 |
langs = langs[langs != {}]
|
129 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
130 |
|
131 |
+
if linguality_2 == "No English":
|
132 |
d = orig_d.iloc[1:]
|
133 |
+
elif linguality_2 == "Remove top 10":
|
134 |
d = orig_d.iloc[10:]
|
135 |
|
136 |
# Just keep top 25 to avoid vertical scroll
|
|
|
155 |
final_data = pd.merge(
|
156 |
d, orig_d_old, how="outer", on="language"
|
157 |
)
|
158 |
+
final_data['counts'] = final_data['counts'].fillna(0).astype(int)
|
159 |
+
final_data['old_c'] = final_data['old_c'].fillna(0).astype(int)
|
160 |
+
final_data["diff"] = final_data["counts"] - final_data["old_c"]
|
161 |
+
final_data['language'] = final_data['language'].astype(str)
|
162 |
st.dataframe(final_data)
|
163 |
|
|
|
|
|
164 |
#with tab2:
|
165 |
if tab == "License":
|
166 |
st.header("License info")
|
167 |
|
168 |
no_license_count = data["license"].isna().sum()
|
169 |
no_license_count_old = old_data["license"].isna().sum()
|
170 |
+
no_license_count_old_old = old_old_data["license"].isna().sum()
|
171 |
+
|
172 |
+
|
173 |
+
col1, col2 = st.columns(2)
|
174 |
with col1:
|
175 |
v = total_samples-no_license_count
|
176 |
v_old = total_samples_old-no_license_count_old
|
177 |
st.metric(label="License Specified", value=v, delta=int(v-v_old))
|
178 |
with col2:
|
179 |
+
v = total_samples-no_license_count
|
180 |
+
v_old = total_samples_old-no_license_count_old
|
181 |
+
# Oldest snapshot: subtract from its own total (was total_samples_old, a copy-paste slip).
v_old_old = total_samples_old_old-no_license_count_old_old
|
182 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
183 |
+
st.metric(label="License Specified Rate of Change", value=curr_change, delta=delta)
|
184 |
+
|
185 |
+
col1, col2 = st.columns(2)
|
186 |
+
with col1:
|
187 |
+
st.metric(label="No License Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
|
188 |
+
with col2:
|
189 |
+
curr_change, delta = change_and_delta(no_license_count_old_old, no_license_count_old, no_license_count)
|
190 |
+
st.metric(label="No License Specified Rate of Change", value=curr_change, delta=delta)
|
191 |
+
|
192 |
+
col1, col2 = st.columns(2)
|
193 |
+
unique_licenses = len(data["license"].unique())
|
194 |
+
unique_licenses_old = len(old_data["license"].unique())
|
195 |
+
unique_licenses_old_old = len(old_old_data["license"].unique())
|
196 |
+
with col1:
|
197 |
st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
|
198 |
+
with col2:
|
199 |
+
curr_change, delta = change_and_delta(unique_licenses_old_old, unique_licenses_old, unique_licenses)
|
200 |
+
st.metric(label="Total Unique Licenses Rate of Change", value=curr_change, delta=delta)
|
201 |
+
st.text(f"New licenses {set(data['license'].unique())-set(old_data['license'].unique())}")
|
202 |
+
st.text(f"Old licenses {set(old_data['license'].unique())-set(data['license'].unique())}")
|
203 |
|
204 |
st.subheader("Distribution of licenses per model repo")
|
205 |
license_filter = st.selectbox(
|
|
|
250 |
|
251 |
tags_old = old_data["tags"].explode()
|
252 |
tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
|
253 |
+
s_o = tags_old["tag"]
|
254 |
+
s_o = s_o[s_o.apply(type) == str]
|
255 |
+
unique_tags_old = len(s_o.unique())
|
256 |
+
|
257 |
+
tags_old_old = old_old_data["tags"].explode()
|
258 |
+
tags_old_old = tags_old_old[tags_old_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
|
259 |
+
s_old_old = tags_old_old["tag"]
|
260 |
+
s_old_old = s_old_old[s_old_old.apply(type) == str]
|
261 |
+
unique_tags_old_old = len(s_old_old.unique())
|
262 |
|
263 |
no_pipeline_count = data["pipeline"].isna().sum()
|
264 |
no_pipeline_count_old = old_data["pipeline"].isna().sum()
|
265 |
+
no_pipeline_count_old_old = old_old_data["pipeline"].isna().sum()
|
266 |
|
267 |
+
col1, col2 = st.columns(2)
|
268 |
+
v = total_samples-no_pipeline_count
|
269 |
+
v_old = total_samples_old-no_pipeline_count_old
|
270 |
+
v_old_old = total_samples_old_old-no_pipeline_count_old_old
|
271 |
with col1:
|
|
|
|
|
272 |
st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
|
273 |
with col2:
|
274 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
275 |
+
st.metric(label="# models rate of change", value=curr_change, delta=delta)
|
276 |
+
|
277 |
+
col1, col2 = st.columns(2)
|
278 |
+
with col1:
|
279 |
st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
|
280 |
+
with col2:
|
281 |
+
curr_change, delta = change_and_delta(no_pipeline_count_old_old, no_pipeline_count_old, no_pipeline_count)
|
282 |
+
st.metric(label="No pipeline Specified rate of change", value=curr_change, delta=delta)
|
283 |
+
|
284 |
+
col1, col2 = st.columns(2)
|
285 |
+
with col1:
|
286 |
st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
|
287 |
+
with col2:
|
288 |
+
curr_change, delta = change_and_delta(unique_tags_old_old, unique_tags_old, unique_tags)
|
289 |
+
# This column shows the rate of change, so label it distinctly from the absolute count beside it.
st.metric(label="Total Unique Tags Rate of Change", value=curr_change, delta=delta)
|
290 |
|
291 |
+
modality_filter = st.selectbox(
|
292 |
'Modalities',
|
293 |
["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
st.subheader("High-level metrics")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
|
297 |
col1, col2, col3 = st.columns(3)
|
298 |
with col1:
|
299 |
p = st.selectbox(
|
300 |
'What pipeline do you want to see?',
|
301 |
+
["all", *data["pipeline"].unique()]
|
302 |
)
|
303 |
with col2:
|
304 |
l = st.selectbox(
|
305 |
'What library do you want to see?',
|
306 |
+
["all", "not transformers", *data["library"].unique()]
|
307 |
)
|
308 |
with col3:
|
309 |
f = st.selectbox(
|
310 |
+
'What trf framework support?',
|
311 |
+
["all", "pytorch", "tensorflow", "jax"]
|
312 |
)
|
313 |
|
314 |
col1, col2 = st.columns(2)
|
|
|
321 |
o = st.selectbox(
|
322 |
label="Operation (for tags)",
|
323 |
options=["Any", "All", "None"]
|
324 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
|
326 |
+
filtered_data, tags = filter_pipeline_data(data, modality_filter, p, l, f, filt, o)
|
327 |
+
filtered_data_old, old_tags = filter_pipeline_data(old_data, modality_filter, p, l, f, filt, o)
|
328 |
+
filtered_data_old_old, old_old_tags = filter_pipeline_data(old_old_data, modality_filter, p, l, f, filt, o)
|
329 |
+
st.subheader("Pipeline breakdown")
|
330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
|
332 |
d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
|
333 |
columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
|
|
|
335 |
final_data = pd.merge(
|
336 |
d, grouped_data, how="outer", on="pipeline"
|
337 |
)
|
|
|
338 |
|
339 |
d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
|
340 |
grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
|
341 |
final_data_old = pd.merge(
|
342 |
d_old, grouped_data_old, how="outer", on="pipeline"
|
343 |
)
|
344 |
+
|
345 |
+
# Counts for the oldest snapshot get their own name so d_old keeps the previous snapshot's counts.
d_old_old = filtered_data_old_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
|
346 |
+
grouped_data_old_old = filtered_data_old_old.groupby("pipeline").sum()[columns_of_interest]
|
347 |
+
|
348 |
sums = grouped_data.sum()
|
349 |
sums_old = grouped_data_old.sum()
|
350 |
+
sums_old_old = grouped_data_old_old.sum()
|
351 |
|
352 |
+
col1, col2, col3, col4 = st.columns(4)
|
353 |
+
v = filtered_data.shape[0]
|
354 |
+
v_old = filtered_data_old.shape[0]
|
355 |
+
v_old_old = filtered_data_old_old.shape[0]
|
356 |
with col1:
|
357 |
+
st.metric(label="Total models", value=v, delta=int(v - v_old))
|
358 |
with col2:
|
359 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
360 |
+
st.metric(label="Total models rate of change", value=curr_change, delta=delta)
|
361 |
with col3:
|
362 |
+
st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
|
363 |
+
with col4:
|
364 |
+
# (leftover debug print of the three download totals removed; it wrote to the server log on every rerun)
|
365 |
+
curr_change, delta = change_and_delta(sums_old_old["downloads_30d"], sums_old["downloads_30d"], sums["downloads_30d"])
|
366 |
+
st.metric(label="Cumulative Downloads (30d) rate of change", value=curr_change, delta=delta)
|
367 |
+
|
368 |
+
col1, col2, col3 = st.columns(3)
|
369 |
+
with col1:
|
370 |
+
st.metric(label="Total unique pipelines", value=len(filtered_data["pipeline"].unique()))
|
371 |
+
with col2:
|
372 |
st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
|
373 |
+
with col3:
|
374 |
+
curr_change, delta = change_and_delta(sums_old_old["likes"], sums_old["likes"], sums["likes"])
|
375 |
+
st.metric(label="Cumulative Likes rate of change", value=curr_change, delta=delta)
|
376 |
+
|
377 |
|
378 |
col1, col2, col3 = st.columns(3)
|
379 |
with col1:
|
|
|
383 |
with col3:
|
384 |
st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
|
385 |
|
386 |
+
col1, col2 = st.columns(2)
|
387 |
+
with col1:
|
388 |
+
st.metric(label="Total unique libraries", value=len(filtered_data["library"].unique()))
|
389 |
+
with col2:
|
390 |
+
st.metric(label="Total unique modality", value=len(filtered_data["modality"].unique()))
|
391 |
|
392 |
+
|
393 |
+
col1, col2 = st.columns(2)
|
394 |
+
with col1:
|
395 |
+
st.metric(label="Total transformers models", value=len(filtered_data[filtered_data["library"] == "transformers"]))
|
396 |
+
with col2:
|
397 |
+
st.metric(label="Total non transformers models", value=len(filtered_data[filtered_data["library"] != "transformers"]))
|
398 |
+
|
399 |
+
st.metric(label="Unique Tags", value=len(tags), delta=int(len(tags) - len(old_tags)))
|
400 |
+
st.text(f"New tags {set(tags)-set(old_tags)}")
|
401 |
+
st.text(f"Lost tags {set(old_tags)-set(tags)}")
|
402 |
+
|
403 |
+
st.subheader("Pipeline breakdown by modality")
|
404 |
+
col1, col2 = st.columns(2)
|
405 |
+
with col1:
|
406 |
+
st.metric(label="Total CV models", value=len(filtered_data[filtered_data["modality"] == "cv"]))
|
407 |
+
with col2:
|
408 |
+
st.metric(label="Total NLP models", value=len(filtered_data[filtered_data["modality"] == "nlp"]))
|
409 |
+
|
410 |
+
col1, col2 = st.columns(2)
|
411 |
+
with col1:
|
412 |
+
st.metric(label="Total Audio models", value=len(filtered_data[filtered_data["modality"] == "audio"]))
|
413 |
+
with col2:
|
414 |
+
st.metric(label="Total RL models", value=len(filtered_data[filtered_data["modality"] == "rl"]))
|
415 |
+
|
416 |
+
col1, col2 = st.columns(2)
|
417 |
+
with col1:
|
418 |
+
st.metric(label="Total Tabular models", value=len(filtered_data[filtered_data["modality"] == "tabular"]))
|
419 |
+
with col2:
|
420 |
+
st.metric(label="Total Multimodal models", value=len(filtered_data[filtered_data["modality"] == "multimodal"]))
|
421 |
|
422 |
st.subheader("Count of models per pipeline")
|
423 |
st.write(alt.Chart(d).mark_bar().encode(
|
|
|
457 |
"downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
|
458 |
raw_data = filtered_data[columns_of_interest]
|
459 |
st.dataframe(raw_data)
|
|
|
|
|
460 |
|
461 |
# todo : add activity metric
|
462 |
|
|
|
468 |
columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
|
469 |
sums = data[columns_of_interest].sum()
|
470 |
sums_old = old_data[columns_of_interest].sum()
|
471 |
+
sums_old_old = old_old_data[columns_of_interest].sum()
|
472 |
|
473 |
col1, col2, col3, col4 = st.columns(4)
|
474 |
with col1:
|
|
|
480 |
with col4:
|
481 |
st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
|
482 |
|
483 |
+
col1, col2, col3, col4 = st.columns(4)
|
484 |
+
with col1:
|
485 |
+
curr_change, delta = change_and_delta(sums_old_old["prs_count"], sums_old["prs_count"], sums["prs_count"])
|
486 |
+
st.metric(label="Total PRs change", value=curr_change,delta=delta)
|
487 |
+
with col2:
|
488 |
+
curr_change, delta = change_and_delta(sums_old_old["prs_open"], sums_old["prs_open"], sums["prs_open"])
|
489 |
+
st.metric(label="PRs opened change", value=curr_change,delta=delta)
|
490 |
+
with col3:
|
491 |
+
curr_change, delta = change_and_delta(sums_old_old["prs_merged"], sums_old["prs_merged"], sums["prs_merged"])
|
492 |
+
st.metric(label="PRs merged change", value=curr_change,delta=delta)
|
493 |
+
with col4:
|
494 |
+
curr_change, delta = change_and_delta(sums_old_old["prs_closed"], sums_old["prs_closed"], sums["prs_closed"])
|
495 |
+
st.metric(label="PRs closed change", value=curr_change,delta=delta)
|
496 |
+
|
497 |
col1, col2, col3 = st.columns(3)
|
498 |
with col1:
|
499 |
st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
|
|
|
502 |
with col3:
|
503 |
st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
|
504 |
|
505 |
+
col1, col2, col3 = st.columns(3)
|
506 |
+
with col1:
|
507 |
+
curr_change, delta = change_and_delta(sums_old_old["discussions_count"], sums_old["discussions_count"], sums["discussions_count"])
|
508 |
+
st.metric(label="Total discussions change", value=curr_change,delta=delta)
|
509 |
+
with col2:
|
510 |
+
curr_change, delta = change_and_delta(sums_old_old["discussions_open"], sums_old["discussions_open"], sums["discussions_open"])
|
511 |
+
st.metric(label="Discussions open change", value=curr_change,delta=delta)
|
512 |
+
with col3:
|
513 |
+
curr_change, delta = change_and_delta(sums_old_old["discussions_closed"], sums_old["discussions_closed"], sums["discussions_closed"])
|
514 |
+
st.metric(label="Discussions closed change", value=curr_change,delta=delta)
|
515 |
+
|
516 |
filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
|
517 |
st.dataframe(filtered_data)
|
518 |
|
|
|
522 |
|
523 |
no_library_count = data["library"].isna().sum()
|
524 |
no_library_count_old = old_data["library"].isna().sum()
|
525 |
+
no_library_count_old_old = old_old_data["library"].isna().sum()
|
526 |
col1, col2, col3 = st.columns(3)
|
527 |
with col1:
|
528 |
v = total_samples-no_library_count
|
|
|
535 |
v_old = len(old_data["library"].unique())
|
536 |
st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
|
537 |
|
538 |
+
col1, col2, col3 = st.columns(3)
|
539 |
+
with col1:
|
540 |
+
v = total_samples-no_library_count
|
541 |
+
v_old = total_samples_old-no_library_count_old
|
542 |
+
v_old_old = total_samples_old_old-no_library_count_old_old
|
543 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
544 |
+
st.metric(label="# models that have any library change", value=curr_change, delta=delta)
|
545 |
+
with col2:
|
546 |
+
curr_change, delta = change_and_delta(no_library_count_old_old, no_library_count_old, no_library_count)
|
547 |
+
st.metric(label="No library Specified Change", value=curr_change, delta=delta)
|
548 |
+
with col3:
|
549 |
+
v = len(data["library"].unique())
|
550 |
+
v_old = len(old_data["library"].unique())
|
551 |
+
v_old_old = len(old_old_data["library"].unique())
|
552 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
553 |
+
# This row shows rates of change; label it like its siblings ("... change") instead of repeating the absolute metric's label.
st.metric(label="Total Unique library change", value=curr_change, delta=delta)
|
554 |
|
555 |
st.subheader("High-level metrics")
|
556 |
filtered_data = data[data['library'].notna()]
|
|
|
610 |
y=alt.X('library', sort=None)
|
611 |
))
|
612 |
|
|
|
|
|
613 |
st.subheader("Aggregated Data")
|
614 |
final_data = pd.merge(
|
615 |
final_data, final_data_old, how="outer", on="library"
|
|
|
632 |
columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
|
633 |
rows = data.shape[0]
|
634 |
rows_old = old_data.shape[0]
|
635 |
+
rows_old_old = old_old_data.shape[0]
|
636 |
|
637 |
cond = data["has_model_index"] | data["has_text"]
|
638 |
with_model_card = data[cond]
|
|
|
642 |
with_model_card_old = old_data[cond]
|
643 |
c_model_card_old = with_model_card_old.shape[0]
|
644 |
|
645 |
+
cond = old_old_data["has_model_index"] | old_old_data["has_text"]
|
646 |
+
with_model_card_old_old = old_old_data[cond]
|
647 |
+
c_model_card_old_old = with_model_card_old_old.shape[0]
|
648 |
+
|
649 |
st.subheader("High-level metrics")
|
650 |
+
col1, col2, col3, col4 = st.columns(4)
|
651 |
with col1:
|
652 |
+
st.metric(label="# with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
|
653 |
with col2:
|
654 |
+
curr_change, delta = change_and_delta(c_model_card_old_old, c_model_card_old, c_model_card)
|
655 |
+
st.metric(label="# with model card file change", value=curr_change, delta=delta)
|
656 |
+
with col3:
|
657 |
+
st.metric(label="# without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))
|
658 |
+
with col4:
|
659 |
+
curr_change, delta = change_and_delta(rows_old_old-c_model_card_old_old, rows_old-c_model_card_old, rows-c_model_card)
|
660 |
+
st.metric(label="# without model card file change", value=curr_change, delta=delta)
|
661 |
+
|
662 |
with_index = data["has_model_index"].sum()
|
663 |
with_index_old = old_data["has_model_index"].sum()
|
664 |
+
with_index_old_old = old_old_data["has_model_index"].sum()
|
665 |
with col1:
|
666 |
+
st.metric(label="# with model index", value=with_index, delta=int(with_index-with_index_old))
|
667 |
with col2:
|
668 |
+
curr_change, delta = change_and_delta(with_index_old_old, with_index_old, with_index)
|
669 |
+
st.metric(label="# with model index change", value=curr_change, delta=delta)
|
670 |
+
with col3:
|
671 |
+
st.metric(label="# without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))
|
672 |
+
with col4:
|
673 |
+
curr_change, delta = change_and_delta(rows_old_old-with_index_old_old, rows_old-with_index_old, rows-with_index)
|
674 |
+
st.metric(label="# without model index change", value=curr_change, delta=delta)
|
675 |
|
676 |
with_text = data["has_text"]
|
677 |
with_text_old = old_data["has_text"]
|
678 |
+
with_text_old_old = old_old_data["has_text"]
|
679 |
+
|
680 |
+
with_text_sum = with_text.sum()
|
681 |
+
with_text_old_sum = with_text_old.sum()
|
682 |
+
with_text_old_old_sum = with_text_old_old.sum()
|
683 |
with col1:
|
684 |
+
st.metric(label="# with model card text", value=with_text_sum, delta=int(with_text_sum-with_text_old_sum))
|
685 |
with col2:
|
686 |
+
curr_change, delta = change_and_delta(with_text_old_old_sum, with_text_old_sum, with_text_sum)
|
687 |
+
st.metric(label="# with model card text change", value=curr_change, delta=delta)
|
688 |
+
with col3:
|
689 |
+
# Delta compares "without text" now vs. the previous snapshot: both sides must be total rows minus rows with text.
st.metric(label="# without card text", value=rows-with_text_sum, delta=int((rows-with_text_sum)-(rows_old-with_text_old_sum)))
|
690 |
+
with col4:
|
691 |
+
curr_change, delta = change_and_delta(rows_old_old-with_text_old_old_sum, rows_old-with_text_old_sum, rows-with_text_sum)
|
692 |
+
st.metric(label="# without card text change", value=curr_change, delta=delta)
|
693 |
|
|
|
694 |
st.subheader("Length (chars) of model card content")
|
695 |
+
fig, _ = plt.subplots()
|
696 |
+
_ = data["length_bins"].value_counts().plot.bar()
|
697 |
st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
|
698 |
st.pyplot(fig)
|
699 |
|
pipelines.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def filter_tags(row, filt, operator):
    """Decide whether a row's tags satisfy a tag filter.

    Args:
        row: Mapping-like row (for example a DataFrame row) with a ``"tags"``
            entry.
        filt: Iterable of tag names to match against.
        operator: ``"All"`` (every filter tag is present), ``"Any"`` (at least
            one is present) or ``"None"`` (no filter tag is present).

    Returns:
        True when the row matches under ``operator``; False otherwise,
        including for an unrecognised operator.
    """
    raw_tags = row["tags"]
    # Work on a cleaned copy rather than rewriting the list in place: the list
    # object belongs to the caller's data and must not change as a side effect
    # of filtering.
    try:
        tags = [tag for tag in raw_tags if isinstance(tag, str)]
    except TypeError:
        # A missing value (e.g. NaN/None) is not iterable; treat it as no tags.
        tags = []

    if operator == "All":
        return all(elem in tags for elem in filt)

    overlap = set(tags) & set(filt)
    if operator == "Any":
        return bool(overlap)
    if operator == "None":
        return not overlap
    return False
|
17 |
+
|
18 |
+
|
19 |
+
def filter_pipeline_data(data, modality, pipeline, library, framework, tags, operator):
    """Apply the pipeline-tab filters and list the tags left in the result.

    Args:
        data: DataFrame with at least the ``pipeline``, ``modality``,
            ``library`` and ``tags`` columns, plus one column per framework
            name that may be passed in ``framework``.
        modality: ``"All"`` or a modality name; it is lower-cased before being
            compared with the ``modality`` column.
        pipeline: ``"all"`` or the exact pipeline value to keep.
        library: ``"all"``, ``"not transformers"`` or the exact library to keep.
        framework: ``"all"`` or the name of a column whose value must equal 1.
        tags: Tag names to filter on; empty or None disables tag filtering.
        operator: ``"Any"``, ``"All"`` or ``"None"``, forwarded to
            ``filter_tags``.

    Returns:
        A ``(filtered_data, unique_tags)`` tuple. ``unique_tags`` is an array
        of the string tags found in ``filtered_data``, most frequent first.
    """
    # Rows without a pipeline are never part of this view.
    data = data[data["pipeline"].notna()]

    if modality != "All":
        data = data[data["modality"] == modality.lower()]

    if pipeline != "all":
        data = data[data["pipeline"] == pipeline]

    if library == "not transformers":
        data = data[data["library"] != "transformers"]
    elif library != "all":
        data = data[data["library"] == library]

    if framework != "all":
        data = data[data[framework] == 1]

    # Skip the row-wise apply when there is nothing to filter on, or nothing
    # left to filter: apply(axis=1) on an empty frame does not return a
    # boolean mask.
    if tags is not None and len(tags) > 0 and not data.empty:
        keep = data.apply(filter_tags, axis=1, filt=tags, operator=operator)
        data = data[keep]

    exploded = data["tags"].explode()
    tag_counts = (
        exploded[exploded.notna()]
        .value_counts()
        .rename_axis("tag")
        .to_frame("counts")
        .reset_index()
    )
    tag_names = tag_counts["tag"]
    # Tag lists can contain non-string junk (e.g. a stray False); drop it.
    tag_names = tag_names[[isinstance(tag, str) for tag in tag_names]]

    return data, tag_names.unique()
|
utils.py
CHANGED
@@ -58,12 +58,15 @@ def eval_tags(row):
|
|
58 |
return val
|
59 |
|
60 |
def change_pct(old, new):
|
|
|
|
|
61 |
return round(100* (new - old) / new, 3)
|
62 |
|
63 |
def change_and_delta(old_old, old, new):
|
64 |
curr_change = change_pct(old, new)
|
65 |
prev_change = change_pct(old_old, old)
|
66 |
delta = round(curr_change-prev_change, 3)
|
67 |
-
delta
|
|
|
68 |
curr_change = f"{curr_change}%"
|
69 |
return curr_change, delta
|
|
|
58 |
return val
|
59 |
|
60 |
def change_pct(old, new):
    """Return the change from ``old`` to ``new`` as a percentage of ``new``.

    Note that the denominator is ``new`` (not ``old``), so the result is the
    share of the current value that the change accounts for.

    Args:
        old: Previous value.
        new: Current value.

    Returns:
        The percentage rounded to 3 decimals. When ``new`` is 0 the ratio is
        undefined: 0.0 is returned if ``old`` is also 0 (nothing changed),
        otherwise the sentinel -10000000 so callers still get a number.
    """
    if new == 0:
        if old == 0:
            # 0 -> 0 is "no change", not an undefined drop.
            return 0.0
        return -10000000
    return round(100 * (new - old) / new, 3)
|
64 |
|
65 |
def change_and_delta(old_old, old, new):
    """Format the latest rate of change and how it moved since the prior period.

    Args:
        old_old: Value two snapshots ago.
        old: Value one snapshot ago.
        new: Current value.

    Returns:
        A ``(curr_change, delta)`` tuple of strings. ``curr_change`` is
        ``change_pct(old, new)`` followed by ``%``. ``delta`` is the difference
        between that and ``change_pct(old_old, old)``, rounded to 3 decimals,
        always followed by ``%`` and prefixed with ``+`` when positive.
    """
    curr_change = change_pct(old, new)
    prev_change = change_pct(old_old, old)
    delta = round(curr_change - prev_change, 3)
    if delta == 0:
        # Normalise 0 / -0.0 so a zero delta never renders with a minus sign.
        delta = 0.0
    # Always return a string with the unit; previously only positive deltas
    # were formatted, so negative and zero deltas came back as bare floats.
    sign = "+" if delta > 0 else ""
    return f"{curr_change}%", f"{sign}{delta}%"
|