Spaces:

open-source-metrics
/

models-explorer

Sleeping

App Files Files Community

osanseviero committed on Oct 28, 2022

Commit

78f7e42

1 Parent(s): 6c21ae3

Release v2

Browse files

Files changed (8) hide show

__pycache__/language.cpython-38.pyc +0 -0
__pycache__/pipelines.cpython-38.pyc +0 -0
__pycache__/utils.cpython-38.pyc +0 -0
changelog.md +16 -1
language.py +52 -0
models.py +227 -214
pipelines.py +45 -0
utils.py +4 -1

__pycache__/language.cpython-38.pyc ADDED Viewed

Binary file (1.6 kB). View file

__pycache__/pipelines.cpython-38.pyc ADDED Viewed

Binary file (1.5 kB). View file

__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (2.59 kB). View file

changelog.md CHANGED Viewed

@@ -1,11 +1,26 @@
 Changelog
 v0.2 - Oct 24
 - Languages
     - Allow filtering for modality
-    - Show new languages for the diff
     - Show rate of change in languages
     - Also include multilingual tag as multilingual for model selection in languages
 v0.1
 - Allow pick comparison version

 Changelog
+Planned
+- Allow filtering for just the new models (there is currently no way to get this)
 v0.2 - Oct 24
 - Languages
     - Allow filtering for modality
+    - Show new and removed languages for the diff
     - Show rate of change in languages
     - Also include multilingual tag as multilingual for model selection in languages
+    - Spotted bug: "False" shows up as a row in the dataset. Needs to be looked into
+- License
+    - Add rate of change for top metrics
+    - Show lost and new licenses
+- Pipelines
+    - Add rate of change for all metrics
+    - Fix bug that did not show new tags
+    - Add info per modality
+    - See new tags
+    - Pipeline breakdown by modality
+- Discussions and Libraries
+    - Add rate of change for metrics
 v0.1
 - Allow pick comparison version

language.py CHANGED Viewed

	@@ -0,0 +1,52 @@

+from ast import literal_eval
+def make_lang_list(row):  # Parse the row's "languages" string into a Python object (presumably a list of language tags - TODO confirm).
+    languages = row["languages"]
+    if languages == "none":  # "none" is the placeholder process_for_lang fills in for missing values
+        return []
+    return literal_eval(languages)  # otherwise evaluate the stored Python-literal string
+def language_count(row):  # Length of row["languages"]; process_for_lang applies this after make_lang_list has parsed the column.
+    return len(row["languages"])
+def process_for_lang(data, modality):  # Returns (data with parsed languages plus language_count/multilingual columns, count of rows lacking languages, number of unique languages, the unique language values).
+    # Filter by modality; any other value (e.g. "All") keeps every row
+    if modality == "NLP":
+        data = data[data["modality"] == "nlp"]
+    elif modality == "Audio":
+        data = data[data["modality"] == "audio"]
+    elif modality == "Multimodal":
+        data = data[data["modality"] == "multimodal"]
+    # Mark rows without languages as missing (the rows are kept, only the value is set to None)
+    data.loc[data.languages == "False", 'languages'] = None  # NOTE(review): writes into `data`, which may be a filtered slice or the caller's own frame - confirm this is intended
+    data.loc[data.languages == {}, 'languages'] = None  # meant to catch empty-dict values - TODO confirm the comparison behaves as expected
+    # Count of rows that have no languages
+    no_lang_count = data["languages"].isna().sum()
+    # As the languages column might have multiple languages,
+    # we need to convert it to a list. We then count the number of languages.
+    data["languages"] = data["languages"].fillna('none')  # 'none' is the placeholder make_lang_list maps to []
+    data["languages"] = data.apply(make_lang_list, axis=1)
+    data["language_count"] = data.apply(language_count, axis=1)
+    # Just keep the models with at least one language
+    models_with_langs = data[data["language_count"] > 0]
+    langs = models_with_langs["languages"].explode()  # one entry per element of each row's languages value
+    langs = langs[langs != {}]
+    total_langs = len(langs.unique())
+    data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)  # 1 when "multilingual" is among the row's languages, else 0
+    return data, no_lang_count, total_langs, langs.unique()
+def filter_multilinguality(data, linguality):  # Subset `data` by the selected linguality option; any unrecognised option returns `data` unchanged.
+    if linguality == "Just Multilingual":
+        multilingual_tag = data["multilingual"] == 1  # rows whose multilingual column is 1
+        multiple_lang_tags = data["language_count"] > 1  # rows listing more than one language
+        return data[multilingual_tag | multiple_lang_tags]  # either condition qualifies a row
+    elif linguality == "Three or more languages":
+        return data[data["language_count"] >= 3]
+    else:
+        return data

models.py CHANGED Viewed

@@ -4,7 +4,9 @@ from ast import literal_eval
 import altair as alt
 import matplotlib.pyplot as plt
-from utils import process_dataset, eval_tags
 def main():
     # Pick revision at top
@@ -26,16 +28,6 @@ def main():
             supported_revisions,
             index=2)
-    def change_pct(old, new):
-        return round(100* (new - old) / new, 3)
-    def change_and_delta(old_old, old, new):
-        curr_change = change_pct(old, new)
-        prev_change = change_pct(old_old, old)
-        delta = f"{curr_change-prev_change}%"
-        curr_change = f"{curr_change}%"
-        return curr_change, delta
     # Process dataset
     old_old_data = process_dataset(base_old)
     old_data = process_dataset(base)
@@ -63,44 +55,11 @@ def main():
     tab = st.selectbox(
             'Topic of interest',
-            ["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
     if tab == "Language":
         st.header("Languages info")
-        def make_list(row):
-            languages = row["languages"]
-            if languages == "none":
-                return []
-            return literal_eval(languages)
-        def language_count(row):
-            return len(row["languages"])
-        def process_for_lang(data):
-            # Remove rows without languages
-            data.loc[data.languages == "False", 'languages'] = None
-            data.loc[data.languages == {}, 'languages'] = None
-            # Count of rows that have no languages
-            no_lang_count = data["languages"].isna().sum()
-            # As the languages column might have multiple languages,
-            # we need to convert it to a list. We then count the number of languages.
-            data["languages"] = data["languages"].fillna('none')
-            data["languages"] = data.apply(make_list, axis=1)
-            data["language_count"] = data.apply(language_count, axis=1)
-            # Just keep the models with at least one language
-            models_with_langs = data[data["language_count"] > 0]
-            langs = models_with_langs["languages"].explode()
-            langs = langs[langs != {}]
-            total_langs = len(langs.unique())
-            data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
-            return data, no_lang_count, total_langs, langs.unique()
         filtered_data = data.copy()
         old_filtered_data = old_data.copy()
         old_old_filtered_data = old_old_data.copy()
@@ -109,30 +68,13 @@ def main():
             'Modalities',
             ["All", "NLP", "Audio", "Multimodal"])
-        if modality == "NLP":
-            filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
-            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
-            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
-        elif modality == "Audio":
-            filtered_data = filtered_data[filtered_data["modality"] == "audio"]
-            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
-            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
-        elif modality == "Multimodal":
-            filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
-            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
-            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
-        filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
-        old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
-        old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
-        total_samples_filtered = filtered_data.shape[0]
-        total_samples_old_filtered = old_filtered_data.shape[0]
-        total_samples_old_old_filtered = old_old_filtered_data.shape[0]
-        v = total_samples_filtered-no_lang_count
-        v_old = total_samples_old_filtered-no_lang_count_old
-        v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
         col1, col2 = st.columns(2)
         with col1:
@@ -155,6 +97,7 @@ def main():
             curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
             st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
         st.text(f"New languages {set(langs)-set(langs_old)}")
         st.subheader("Count of languages per model repo")
         st.text("Some repos are for multiple languages, so the count is greater than 1")
@@ -162,19 +105,8 @@ def main():
             'All or just Multilingual',
             ["All", "Just Multilingual", "Three or more languages"])
-        def filter_multilinguality(data):
-            if linguality == "Just Multilingual":
-                multilingual_tag = data["multilingual"] == 1
-                multiple_lang_tags = data["language_count"] > 1
-                return data[multilingual_tag | multiple_lang_tags]
-            elif linguality == "Three or more languages":
-                return data[data["language_count"] >= 3]
-            else:
-                return data
-        models_with_langs = filter_multilinguality(filtered_data)
-        models_with_langs_old = filter_multilinguality(old_filtered_data)
         df1 = models_with_langs['language_count'].value_counts()
         df1_old = models_with_langs_old['language_count'].value_counts()
@@ -185,14 +117,6 @@ def main():
             'All or filtered',
             ["All", "No English", "Remove top 10"])
-        filter = 0
-        if linguality_2 == "All":
-            filter = 0
-        elif linguality_2 == "No English":
-            filter = 1
-        else:
-            filter = 2
         models_with_langs = filtered_data[filtered_data["language_count"] > 0]
         langs = models_with_langs["languages"].explode()
         langs = langs[langs != {}]
@@ -204,9 +128,9 @@ def main():
         langs = langs[langs != {}]
         orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
-        if filter == 1:
             d = orig_d.iloc[1:]
-        elif filter == 2:
             d = orig_d.iloc[10:]
         # Just keep top 25 to avoid vertical scroll
@@ -231,31 +155,51 @@ def main():
         final_data =  pd.merge(
             d, orig_d_old, how="outer", on="language"
         )
-        print(final_data["counts"].isna().sum())
-        print(final_data["old_c"].isna().sum())
-        final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
         st.dataframe(final_data)
     #with tab2:
     if tab == "License":
         st.header("License info")
         no_license_count = data["license"].isna().sum()
         no_license_count_old = old_data["license"].isna().sum()
-        col1, col2, col3 = st.columns(3)
         with col1:
             v = total_samples-no_license_count
             v_old = total_samples_old-no_license_count_old
             st.metric(label="License Specified", value=v, delta=int(v-v_old))
         with col2:
-            st.metric(label="No license Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
-        with col3:
-            unique_licenses = len(data["license"].unique())
-            unique_licenses_old = len(old_data["license"].unique())
             st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
         st.subheader("Distribution of licenses per model repo")
         license_filter = st.selectbox(
@@ -306,81 +250,65 @@ def main():
         tags_old = old_data["tags"].explode()
         tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
-        s = tags_old["tag"]
-        s = s[s.apply(type) == str]
-        unique_tags_old = len(s.unique())
         no_pipeline_count = data["pipeline"].isna().sum()
         no_pipeline_count_old = old_data["pipeline"].isna().sum()
-        col1, col2, col3 = st.columns(3)
         with col1:
-            v = total_samples-no_pipeline_count
-            v_old = total_samples_old-no_pipeline_count_old
             st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
         with col2:
             st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
-        with col3:
             st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
-        pipeline_filter = st.selectbox(
             'Modalities',
             ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
-        filter = 0
-        if pipeline_filter == "All":
-            filter = 0
-        elif pipeline_filter == "NLP":
-            filter = 1
-        elif pipeline_filter == "CV":
-            filter = 2
-        elif pipeline_filter == "Audio":
-            filter = 3
-        elif pipeline_filter == "RL":
-            filter = 4
-        elif pipeline_filter == "Multimodal":
-            filter = 5
-        elif pipeline_filter == "Tabular":
-            filter = 6
         st.subheader("High-level metrics")
-        filtered_data = data[data['pipeline'].notna()]
-        filtered_data_old = old_data[old_data['pipeline'].notna()]
-        if filter == 1:
-            filtered_data = data[data["modality"] == "nlp"]
-            filtered_data_old = old_data[old_data["modality"] == "nlp"]
-        elif filter == 2:
-            filtered_data = data[data["modality"] == "cv"]
-            filtered_data_old = old_data[old_data["modality"] == "cv"]
-        elif filter == 3:
-            filtered_data = data[data["modality"] == "audio"]
-            filtered_data_old = old_data[old_data["modality"] == "audio"]
-        elif filter == 4:
-            filtered_data = data[data["modality"] == "rl"]
-            filtered_data_old = old_data[old_data["modality"] == "rl"]
-        elif filter == 5:
-            filtered_data = data[data["modality"] == "multimodal"]
-            filtered_data_old = old_data[old_data["modality"] == "multimodal"]
-        elif filter == 6:
-            filtered_data = data[data["modality"] == "tabular"]
-            filtered_data_old = old_data[old_data["modality"] == "tabular"]
         col1, col2, col3 = st.columns(3)
         with col1:
             p = st.selectbox(
                 'What pipeline do you want to see?',
-                ["all", *filtered_data["pipeline"].unique()]
             )
         with col2:
             l = st.selectbox(
                 'What library do you want to see?',
-                ["all", "not transformers", *filtered_data["library"].unique()]
             )
         with col3:
             f = st.selectbox(
-                'What framework support? (transformers)',
-                ["all", "py", "tf", "jax"]
             )
         col1, col2 = st.columns(2)
@@ -393,49 +321,13 @@ def main():
             o = st.selectbox(
                 label="Operation (for tags)",
                 options=["Any", "All", "None"]
-            )
-        def filter_fn(row):
-            tags = row["tags"]
-            tags[:] = [d for d in tags if isinstance(d, str)]
-            if o == "All":
-                if all(elem in tags for elem in filt):
-                    return True
-            s1 = set(tags)
-            s2 = set(filt)
-            if o == "Any":
-                if bool(s1 & s2):
-                    return True
-            if o == "None":
-                if len(s1.intersection(s2)) == 0:
-                    return True
-            return False
-        if p != "all":
-            filtered_data = filtered_data[filtered_data["pipeline"] == p]
-            filtered_data_old = filtered_data_old[filtered_data_old["pipeline"] == p]
-        if l != "all" and l != "not transformers":
-            filtered_data = filtered_data[filtered_data["library"] == l]
-            filtered_data_old = filtered_data_old[filtered_data_old["library"] == l]
-        if l == "not transformers":
-            filtered_data = filtered_data[filtered_data["library"] != "transformers"]
-            filtered_data_old = filtered_data_old[filtered_data_old["library"] != "transformers"]
-        if f != "all":
-            if f == "py":
-                filtered_data = filtered_data[filtered_data["pytorch"] == 1]
-                filtered_data_old = filtered_data_old[filtered_data_old["pytorch"] == 1]
-            elif f == "tf":
-                filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
-                filtered_data_old = filtered_data_old[filtered_data_old["tensorflow"] == 1]
-            elif f == "jax":
-                filtered_data = filtered_data[filtered_data["jax"] == 1]
-                filtered_data_old = filtered_data_old[filtered_data_old["jax"] == 1]
-        if filt != []:
-            filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
-            filtered_data_old = filtered_data_old[filtered_data_old.apply(filter_fn, axis=1)]
         d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
         columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
@@ -443,23 +335,45 @@ def main():
         final_data = pd.merge(
             d, grouped_data, how="outer", on="pipeline"
         )
-        sums = grouped_data.sum()
         d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
         grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
         final_data_old = pd.merge(
             d_old, grouped_data_old, how="outer", on="pipeline"
         )
         sums = grouped_data.sum()
         sums_old = grouped_data_old.sum()
-        col1, col2, col3 = st.columns(3)
         with col1:
-            st.metric(label="Total models", value=filtered_data.shape[0], delta=int(filtered_data.shape[0] - filtered_data_old.shape[0]))
         with col2:
-            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
         with col3:
             st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
         col1, col2, col3 = st.columns(3)
         with col1:
@@ -469,9 +383,41 @@ def main():
         with col3:
             st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
-        st.metric(label="Unique Tags", value=unique_tags, delta=int(unique_tags - unique_tags_old))
         st.subheader("Count of models per pipeline")
         st.write(alt.Chart(d).mark_bar().encode(
@@ -511,8 +457,6 @@ def main():
             "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
         raw_data = filtered_data[columns_of_interest]
         st.dataframe(raw_data)
         # todo : add activity metric
@@ -524,6 +468,7 @@ def main():
         columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
         sums = data[columns_of_interest].sum()
         sums_old = old_data[columns_of_interest].sum()
         col1, col2, col3, col4 = st.columns(4)
         with col1:
@@ -535,6 +480,20 @@ def main():
         with col4:
             st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
         col1, col2, col3 = st.columns(3)
         with col1:
             st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
@@ -543,6 +502,17 @@ def main():
         with col3:
             st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
         filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
         st.dataframe(filtered_data)
@@ -552,6 +522,7 @@ def main():
         no_library_count = data["library"].isna().sum()
         no_library_count_old = old_data["library"].isna().sum()
         col1, col2, col3 = st.columns(3)
         with col1:
             v = total_samples-no_library_count
@@ -564,6 +535,22 @@ def main():
             v_old = len(old_data["library"].unique())
             st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
         st.subheader("High-level metrics")
         filtered_data = data[data['library'].notna()]
@@ -623,8 +610,6 @@ def main():
             y=alt.X('library', sort=None)
         ))
         st.subheader("Aggregated Data")
         final_data =  pd.merge(
             final_data, final_data_old, how="outer", on="library"
@@ -647,6 +632,7 @@ def main():
         columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
         rows = data.shape[0]
         rows_old = old_data.shape[0]
         cond = data["has_model_index"] | data["has_text"]
         with_model_card = data[cond]
@@ -656,31 +642,58 @@ def main():
         with_model_card_old = old_data[cond]
         c_model_card_old = with_model_card_old.shape[0]
         st.subheader("High-level metrics")
-        col1, col2, col3 = st.columns(3)
         with col1:
-            st.metric(label="# models with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
         with col2:
-            st.metric(label="# models without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))
         with_index = data["has_model_index"].sum()
         with_index_old = old_data["has_model_index"].sum()
         with col1:
-            st.metric(label="# models with model index", value=with_index, delta=int(with_index-with_index_old))
         with col2:
-            st.metric(label="# models without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))
         with_text = data["has_text"]
         with_text_old = old_data["has_text"]
         with col1:
-            st.metric(label="# models with model card text", value=with_text.sum(), delta=int(with_text.sum()-with_text_old.sum()))
         with col2:
-            st.metric(label="# models without model card text", value=rows-with_text.sum(), delta=int((rows-with_text.sum())-(rows_old-with_text_old.sum())))
         st.subheader("Length (chars) of model card content")
-        fig, ax = plt.subplots()
-        ax = data["length_bins"].value_counts().plot.bar()
         st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
         st.pyplot(fig)

 import altair as alt
 import matplotlib.pyplot as plt
+from utils import process_dataset, eval_tags, change_and_delta
+from language import process_for_lang, filter_multilinguality
+from pipelines import filter_pipeline_data
 def main():
     # Pick revision at top
             supported_revisions,
             index=2)
     # Process dataset
     old_old_data = process_dataset(base_old)
     old_data = process_dataset(base)
     tab = st.selectbox(
             'Topic of interest',
+            ["Language","License",  "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
     if tab == "Language":
         st.header("Languages info")
         filtered_data = data.copy()
         old_filtered_data = old_data.copy()
         old_old_filtered_data = old_old_data.copy()
             'Modalities',
             ["All", "NLP", "Audio", "Multimodal"])
+        filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data, modality)
+        old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data, modality)
+        old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data, modality)
+        v = filtered_data.shape[0]-no_lang_count
+        v_old = old_filtered_data.shape[0]-no_lang_count_old
+        v_old_old = old_old_filtered_data.shape[0]-no_lang_count_old_old
         col1, col2 = st.columns(2)
         with col1:
             curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
             st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
         st.text(f"New languages {set(langs)-set(langs_old)}")
+        st.text(f"Lost languages {set(langs_old)-set(langs)}")
         st.subheader("Count of languages per model repo")
         st.text("Some repos are for multiple languages, so the count is greater than 1")
             'All or just Multilingual',
             ["All", "Just Multilingual", "Three or more languages"])
+        models_with_langs = filter_multilinguality(filtered_data, linguality)
+        models_with_langs_old = filter_multilinguality(old_filtered_data, linguality)
         df1 = models_with_langs['language_count'].value_counts()
         df1_old = models_with_langs_old['language_count'].value_counts()
             'All or filtered',
             ["All", "No English", "Remove top 10"])
         models_with_langs = filtered_data[filtered_data["language_count"] > 0]
         langs = models_with_langs["languages"].explode()
         langs = langs[langs != {}]
         langs = langs[langs != {}]
         orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
+        if linguality_2 == "No English":
             d = orig_d.iloc[1:]
+        elif linguality_2 == "Remove top 10":
             d = orig_d.iloc[10:]
         # Just keep top 25 to avoid vertical scroll
         final_data =  pd.merge(
             d, orig_d_old, how="outer", on="language"
         )
+        final_data['counts'] = final_data['counts'].fillna(0).astype(int)
+        final_data['old_c'] = final_data['old_c'].fillna(0).astype(int)
+        final_data["diff"] = final_data["counts"] - final_data["old_c"]
+        final_data['language'] = final_data['language'].astype(str)
         st.dataframe(final_data)
     #with tab2:
     if tab == "License":
         st.header("License info")
         no_license_count = data["license"].isna().sum()
         no_license_count_old = old_data["license"].isna().sum()
+        no_license_count_old_old = old_old_data["license"].isna().sum()
+        col1, col2 = st.columns(2)
         with col1:
             v = total_samples-no_license_count
             v_old = total_samples_old-no_license_count_old
             st.metric(label="License Specified", value=v, delta=int(v-v_old))
         with col2:
+            v = total_samples-no_license_count
+            v_old = total_samples_old-no_license_count_old
+            v_old_old = total_samples_old-no_license_count_old_old
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="License Specified Rate of Change", value=curr_change, delta=delta)
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="No License Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
+        with col2:
+            curr_change, delta = change_and_delta(no_license_count_old_old, no_license_count_old, no_license_count)
+            st.metric(label="No License Specified Rate of Change", value=curr_change, delta=delta)
+        col1, col2 = st.columns(2)
+        unique_licenses = len(data["license"].unique())
+        unique_licenses_old = len(old_data["license"].unique())
+        unique_licenses_old_old = len(old_old_data["license"].unique())
+        with col1:
             st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
+        with col2:
+            curr_change, delta = change_and_delta(unique_licenses_old_old, unique_licenses_old, unique_licenses)
+            st.metric(label="Total Unique Licenses Rate of Change", value=curr_change, delta=delta)
+        st.text(f"New licenses {set(data['license'].unique())-set(old_data['license'].unique())}")
+        st.text(f"Old licenses {set(old_data['license'].unique())-set(data['license'].unique())}")
         st.subheader("Distribution of licenses per model repo")
         license_filter = st.selectbox(
         tags_old = old_data["tags"].explode()
         tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+        s_o = tags_old["tag"]
+        s_o = s_o[s_o.apply(type) == str]
+        unique_tags_old = len(s_o.unique())
+        tags_old_old = old_old_data["tags"].explode()
+        tags_old_old = tags_old_old[tags_old_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
+        s_old_old = tags_old_old["tag"]
+        s_old_old = s_old_old[s_old_old.apply(type) == str]
+        unique_tags_old_old = len(s_old_old.unique())
         no_pipeline_count = data["pipeline"].isna().sum()
         no_pipeline_count_old = old_data["pipeline"].isna().sum()
+        no_pipeline_count_old_old = old_old_data["pipeline"].isna().sum()
+        col1, col2 = st.columns(2)
+        v = total_samples-no_pipeline_count
+        v_old = total_samples_old-no_pipeline_count_old
+        v_old_old = total_samples_old_old-no_pipeline_count_old_old
         with col1:
             st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
         with col2:
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="# models rate of change", value=curr_change, delta=delta)
+        col1, col2 = st.columns(2)
+        with col1:
             st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
+        with col2:
+            curr_change, delta = change_and_delta(no_pipeline_count_old_old, no_pipeline_count_old, no_pipeline_count)
+            st.metric(label="No pipeline Specified rate of change", value=curr_change, delta=delta)
+        col1, col2 = st.columns(2)
+        with col1:
             st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
+        with col2:
+            curr_change, delta = change_and_delta(unique_tags_old_old, unique_tags_old, unique_tags)
+            st.metric(label="Total Unique Tags", value=curr_change, delta=delta)
+        modality_filter = st.selectbox(
             'Modalities',
             ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
         st.subheader("High-level metrics")
         col1, col2, col3 = st.columns(3)
         with col1:
             p = st.selectbox(
                 'What pipeline do you want to see?',
+                ["all", *data["pipeline"].unique()]
             )
         with col2:
             l = st.selectbox(
                 'What library do you want to see?',
+                ["all", "not transformers", *data["library"].unique()]
             )
         with col3:
             f = st.selectbox(
+                'What trf framework support?',
+                ["all", "pytorch", "tensorflow", "jax"]
             )
         col1, col2 = st.columns(2)
             o = st.selectbox(
                 label="Operation (for tags)",
                 options=["Any", "All", "None"]
+            )
+        filtered_data, tags = filter_pipeline_data(data, modality_filter, p, l, f, filt, o)
+        filtered_data_old, old_tags = filter_pipeline_data(old_data, modality_filter, p, l, f, filt, o)
+        filtered_data_old_old, old_old_tags = filter_pipeline_data(old_old_data, modality_filter, p, l, f, filt, o)
+        st.subheader("Pipeline breakdown")
         d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
         columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
         final_data = pd.merge(
             d, grouped_data, how="outer", on="pipeline"
         )
         d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
         grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
         final_data_old = pd.merge(
             d_old, grouped_data_old, how="outer", on="pipeline"
         )
+        d_old = filtered_data_old_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
+        grouped_data_old_old = filtered_data_old_old.groupby("pipeline").sum()[columns_of_interest]
         sums = grouped_data.sum()
         sums_old = grouped_data_old.sum()
+        sums_old_old = grouped_data_old_old.sum()
+        col1, col2, col3, col4 = st.columns(4)
+        v = filtered_data.shape[0]
+        v_old = filtered_data_old.shape[0]
+        v_old_old = filtered_data_old_old.shape[0]
         with col1:
+            st.metric(label="Total models", value=v, delta=int(v - v_old))
         with col2:
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="Total models rate of change", value=curr_change, delta=delta)
         with col3:
+            st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
+        with col4:
+            print(sums_old_old["downloads_30d"], sums_old["downloads_30d"], sums["downloads_30d"])
+            curr_change, delta = change_and_delta(sums_old_old["downloads_30d"], sums_old["downloads_30d"], sums["downloads_30d"])
+            st.metric(label="Cumulative Downloads (30d) rate of change", value=curr_change, delta=delta)
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric(label="Total unique pipelines", value=len(filtered_data["pipeline"].unique()))
+        with col2:
             st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
+        with col3:
+            curr_change, delta = change_and_delta(sums_old_old["likes"], sums_old["likes"], sums["likes"])
+            st.metric(label="Cumulative Likes rate of change", value=curr_change, delta=delta)
         col1, col2, col3 = st.columns(3)
         with col1:
         with col3:
             st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="Total unique libraries", value=len(filtered_data["library"].unique()))
+        with col2:
+            st.metric(label="Total unique modality", value=len(filtered_data["modality"].unique()))
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="Total transformers models", value=len(filtered_data[filtered_data["library"] == "transformers"]))
+        with col2:
+            st.metric(label="Total non transformers models", value=len(filtered_data[filtered_data["library"] != "transformers"]))
+        st.metric(label="Unique Tags", value=len(tags), delta=int(len(tags) - len(old_tags)))
+        st.text(f"New tags {set(tags)-set(old_tags)}")
+        st.text(f"Lost tags {set(old_tags)-set(tags)}")
+        st.subheader("Pipeline breakdown by modality")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="Total CV models", value=len(filtered_data[filtered_data["modality"] == "cv"]))
+        with col2:
+            st.metric(label="Total NLP models", value=len(filtered_data[filtered_data["modality"] == "nlp"]))
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="Total Audio models", value=len(filtered_data[filtered_data["modality"] == "audio"]))
+        with col2:
+            st.metric(label="Total RL models", value=len(filtered_data[filtered_data["modality"] == "rl"]))
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric(label="Total Tabular models", value=len(filtered_data[filtered_data["modality"] == "tabular"]))
+        with col2:
+            st.metric(label="Total Multimodal models", value=len(filtered_data[filtered_data["modality"] == "multimodal"]))
         st.subheader("Count of models per pipeline")
         st.write(alt.Chart(d).mark_bar().encode(
             "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
         raw_data = filtered_data[columns_of_interest]
         st.dataframe(raw_data)
         # todo : add activity metric
         columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
         sums = data[columns_of_interest].sum()
         sums_old = old_data[columns_of_interest].sum()
+        sums_old_old = old_old_data[columns_of_interest].sum()
         col1, col2, col3, col4 = st.columns(4)
         with col1:
         with col4:
             st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            curr_change, delta = change_and_delta(sums_old_old["prs_count"], sums_old["prs_count"], sums["prs_count"])
+            st.metric(label="Total PRs change", value=curr_change,delta=delta)
+        with col2:
+            curr_change, delta = change_and_delta(sums_old_old["prs_open"], sums_old["prs_open"], sums["prs_open"])
+            st.metric(label="PRs opened change", value=curr_change,delta=delta)
+        with col3:
+            curr_change, delta = change_and_delta(sums_old_old["prs_merged"], sums_old["prs_merged"], sums["prs_merged"])
+            st.metric(label="PRs merged change", value=curr_change,delta=delta)
+        with col4:
+            curr_change, delta = change_and_delta(sums_old_old["prs_closed"], sums_old["prs_closed"], sums["prs_closed"])
+            st.metric(label="PRs closed change", value=curr_change,delta=delta)
         col1, col2, col3 = st.columns(3)
         with col1:
             st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
         with col3:
             st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            curr_change, delta = change_and_delta(sums_old_old["discussions_count"], sums_old["discussions_count"], sums["discussions_count"])
+            st.metric(label="Total discussions change", value=curr_change,delta=delta)
+        with col2:
+            curr_change, delta = change_and_delta(sums_old_old["discussions_open"], sums_old["discussions_open"], sums["discussions_open"])
+            st.metric(label="Discussions open change", value=curr_change,delta=delta)
+        with col3:
+            curr_change, delta = change_and_delta(sums_old_old["discussions_closed"], sums_old["discussions_closed"], sums["discussions_closed"])
+            st.metric(label="Discussions closed change", value=curr_change,delta=delta)
         filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
         st.dataframe(filtered_data)
         no_library_count = data["library"].isna().sum()
         no_library_count_old = old_data["library"].isna().sum()
+        no_library_count_old_old = old_old_data["library"].isna().sum()
         col1, col2, col3 = st.columns(3)
         with col1:
             v = total_samples-no_library_count
             v_old = len(old_data["library"].unique())
             st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            v = total_samples-no_library_count
+            v_old = total_samples_old-no_library_count_old
+            v_old_old = total_samples_old_old-no_library_count_old_old
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="# models that have any library change", value=curr_change, delta=delta)
+        with col2:
+            curr_change, delta = change_and_delta(no_library_count_old_old, no_library_count_old, no_library_count)
+            st.metric(label="No library Specified Change", value=curr_change, delta=delta)
+        with col3:
+            v = len(data["library"].unique())
+            v_old = len(old_data["library"].unique())
+            v_old_old = len(old_old_data["library"].unique())
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="Total Unique library", value=curr_change, delta=delta)
         st.subheader("High-level metrics")
         filtered_data = data[data['library'].notna()]
             y=alt.X('library', sort=None)
         ))
         st.subheader("Aggregated Data")
         final_data =  pd.merge(
             final_data, final_data_old, how="outer", on="library"
         columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
         rows = data.shape[0]
         rows_old = old_data.shape[0]
+        rows_old_old = old_old_data.shape[0]
         cond = data["has_model_index"] | data["has_text"]
         with_model_card = data[cond]
         with_model_card_old = old_data[cond]
         c_model_card_old = with_model_card_old.shape[0]
+        cond = old_old_data["has_model_index"] | old_old_data["has_text"]
+        with_model_card_old_old = old_old_data[cond]
+        c_model_card_old_old = with_model_card_old_old.shape[0]
         st.subheader("High-level metrics")
+        col1, col2, col3, col4 = st.columns(4)
         with col1:
+            st.metric(label="# with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
         with col2:
+            curr_change, delta = change_and_delta(c_model_card_old_old, c_model_card_old, c_model_card)
+            st.metric(label="# with model card file change", value=curr_change, delta=delta)
+        with col3:
+            st.metric(label="# without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))
+        with col4:
+            curr_change, delta = change_and_delta(rows_old_old-c_model_card_old_old, rows_old-c_model_card_old, rows-c_model_card)
+            st.metric(label="# without model card file change", value=curr_change, delta=delta)
         with_index = data["has_model_index"].sum()
         with_index_old = old_data["has_model_index"].sum()
+        with_index_old_old = old_old_data["has_model_index"].sum()
         with col1:
+            st.metric(label="# with model index", value=with_index, delta=int(with_index-with_index_old))
         with col2:
+            curr_change, delta = change_and_delta(with_index_old_old, with_index_old, with_index)
+            st.metric(label="# with model index change", value=curr_change, delta=delta)
+        with col3:
+            st.metric(label="# without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))
+        with col4:
+            curr_change, delta = change_and_delta(rows_old_old-with_index_old_old, rows_old-with_index_old, rows-with_index)
+            st.metric(label="# without model index change",  value=curr_change, delta=delta)
         with_text = data["has_text"]
         with_text_old = old_data["has_text"]
+        with_text_old_old = old_old_data["has_text"]
+        with_text_sum = with_text.sum()
+        with_text_old_sum = with_text_old.sum()
+        with_text_old_old_sum = with_text_old_old.sum()
         with col1:
+            st.metric(label="# with model card text", value=with_text_sum, delta=int(with_text_sum-with_text_old_sum))
         with col2:
+            curr_change, delta = change_and_delta(with_text_old_old_sum, with_text_old_sum, with_text_sum)
+            st.metric(label="# with model card text change", value=curr_change, delta=delta)
+        with col3:
+            st.metric(label="# without card text", value=rows-with_text_sum, delta=int((rows-with_text_sum)-(with_text_old_sum)))
+        with col4:
+            curr_change, delta = change_and_delta(rows_old_old-with_text_old_old_sum, rows_old-with_text_old_sum, rows-with_text_sum)
+            st.metric(label="# without card text change", value=curr_change, delta=delta)
         st.subheader("Length (chars) of model card content")
+        fig, _ = plt.subplots()
+        _ = data["length_bins"].value_counts().plot.bar()
         st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
         st.pyplot(fig)

pipelines.py ADDED Viewed

	@@ -0,0 +1,45 @@

def filter_tags(row, filt, operator):
    """Decide whether a row's tags satisfy a tag filter.

    Only the string entries of ``row["tags"]`` take part in the comparison,
    and the row's own tag container is left untouched.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing an
            iterable under the ``"tags"`` key.
        filt: Iterable of tags to compare against.
        operator: ``"All"`` (every tag in ``filt`` is present), ``"Any"``
            (at least one is present) or ``"None"`` (none is present).

    Returns:
        True when the row matches; False otherwise, including for an
        unrecognised ``operator``.
    """
    # Build a fresh set instead of rewriting row["tags"] in place, so that
    # filtering never modifies the caller's data as a side effect.
    present = {tag for tag in row["tags"] if isinstance(tag, str)}
    wanted = set(filt)
    if operator == "All":
        return wanted <= present
    if operator == "Any":
        return not present.isdisjoint(wanted)
    if operator == "None":
        return present.isdisjoint(wanted)
    return False
def filter_pipeline_data(data, modality, pipeline, library, framework, tags, operator):
    """Filter the models table and list the string tags that remain.

    Rows without a pipeline are always dropped. Every other filter is
    skipped when its argument holds the "show everything" value.

    Args:
        data: DataFrame with at least the ``pipeline``, ``modality``,
            ``library`` and ``tags`` columns, plus one column per framework
            name that may be passed as ``framework``.
        modality: ``"All"`` or a modality name; it is lower-cased before
            being compared with the ``modality`` column.
        pipeline: ``"all"`` or the exact pipeline value to keep.
        library: ``"all"``, ``"not transformers"`` or the exact library
            value to keep.
        framework: ``"all"`` or the name of a column whose value must be 1.
        tags: Tags handed to ``filter_tags``; an empty value or ``None``
            disables tag filtering.
        operator: Tag operator forwarded to ``filter_tags``.

    Returns:
        Tuple ``(filtered_data, unique_tags)`` where ``unique_tags`` is an
        array of the distinct string tags found in the filtered rows.
    """
    data = data[data["pipeline"].notna()]
    if modality != "All":
        data = data[data["modality"] == modality.lower()]
    if pipeline != "all":
        data = data[data["pipeline"] == pipeline]
    if library == "not transformers":
        # Rows with no library at all also count as "not transformers".
        data = data[data["library"] != "transformers"]
    elif library != "all":
        data = data[data["library"] == library]
    if framework != "all":
        data = data[data[framework] == 1]
    if tags:
        # Truthiness (rather than `!= []`) also treats None as "no filter".
        data = data[data.apply(filter_tags, axis=1, filt=tags, operator=operator)]

    exploded = data["tags"].explode()
    tag_counts = (
        exploded[exploded.notna()]
        .value_counts()
        .rename_axis("tag")
        .to_frame("counts")
        .reset_index()
    )
    tag_names = tag_counts["tag"]
    # Tag lists may hold non-string entries; only real strings are reported.
    is_text = tag_names.map(lambda tag: isinstance(tag, str)).astype(bool)
    return data, tag_names[is_text].unique()

utils.py CHANGED Viewed

@@ -58,12 +58,15 @@ def eval_tags(row):
     return val
 # Pre-change side of this diff hunk.
 # Returns the change from `old` to `new` as a percentage of `new`,
 # rounded to 3 decimals.
 # NOTE(review): nothing in this version guards against `new == 0`.
 def change_pct(old, new):
     return round(100* (new - old) / new, 3)
 # Pre-change side of this diff hunk; the line marked "-" is the one removed.
 # Returns (curr_change, delta): curr_change is change_pct(old, new) as a
 # "<number>%" string, and delta is that number minus
 # change_pct(old_old, old), rounded to 3 decimals and also suffixed with "%".
 def change_and_delta(old_old, old, new):
     curr_change = change_pct(old, new)
     prev_change = change_pct(old_old, old)
     delta = round(curr_change-prev_change, 3)
-    delta = f"{delta}%"
     curr_change = f"{curr_change}%"
     return curr_change, delta

     return val
 def change_pct(old, new):
+    if new == 0:
+        return -10000000
     return round(100* (new - old) / new, 3)
 def change_and_delta(old_old, old, new):
     curr_change = change_pct(old, new)
     prev_change = change_pct(old_old, old)
     delta = round(curr_change-prev_change, 3)
+    if delta > 0:
+        delta = f"+{delta}%"
     curr_change = f"{curr_change}%"
     return curr_change, delta