osanseviero committed
Commit 6c21ae3
1 Parent(s): 33c8677

Add languages

Files changed (4):
  1. changelog.md +7 -0
  2. language.py +0 -0
  3. models.py +126 -105
  4. utils.py +69 -0
changelog.md CHANGED
@@ -1,5 +1,12 @@
 Changelog
 
+v0.2 - Oct 24
+- Languages
+- Allow filtering by modality
+- Show new languages in the diff
+- Show rate of change in languages
+- Also count the multilingual tag as multilingual when selecting models by language
+
 v0.1
 - Allow picking the comparison version
 - Show delta in all metrics
language.py ADDED
(empty file)
models.py CHANGED
@@ -1,89 +1,62 @@
 import streamlit as st
 import pandas as pd
-from datasets import load_dataset
 from ast import literal_eval
 import altair as alt
-import plotly.graph_objs as go
 import matplotlib.pyplot as plt
 
+from utils import process_dataset, eval_tags
+
 def main():
-    print("Build")
-    nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
-        "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
-    ]
-    audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
-    cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
-    multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
-    tabular = ["tabular-classification", "tabular-regression"]
-
-    modalities = {
-        "nlp": nlp_tasks,
-        "audio": audio_tasks,
-        "cv": cv_tasks,
-        "multimodal": multimodal,
-        "tabular": tabular,
-        "rl": ["reinforcement-learning"]
-    }
-
-    def modality(row):
-        pipeline = row["pipeline"]
-        for modality, tasks in modalities.items():
-            if pipeline in tasks:
-                return modality
-        if type(pipeline) == "str":
-            return "unk_modality"
-        return None
-
+    # Pick revision at top
     supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
-
-    st.cache(allow_output_mutation=True)
-    def process_dataset(version):
-        # Load dataset at specified revision
-        dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
-
-        # Convert to pandas dataframe
-        data = dataset["train"].to_pandas()
-
-        # Add modality column
-        data["modality"] = data.apply(modality, axis=1)
-
-        # Bin the model card length into some bins
-        data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
-
-        return data
-
-    col1, col2 = st.columns(2)
+    col1, col2, col3 = st.columns(3)
     with col1:
+        new = st.selectbox(
+            'Last revision',
+            supported_revisions,
+            index=0)
+    with col2:
         base = st.selectbox(
            'Old revision',
            supported_revisions,
            index=1)
-    with col2:
-        new = st.selectbox(
-            'Last revision',
+    with col3:
+        base_old = st.selectbox(
+            'Very old revision',
             supported_revisions,
-            index=0)
+            index=2)
+
+    def change_pct(old, new):
+        return round(100 * (new - old) / new, 3)
+
+    def change_and_delta(old_old, old, new):
+        curr_change = change_pct(old, new)
+        prev_change = change_pct(old_old, old)
+        delta = f"{curr_change - prev_change}%"
+        curr_change = f"{curr_change}%"
+        return curr_change, delta
 
+    # Process dataset
+    old_old_data = process_dataset(base_old)
     old_data = process_dataset(base)
     data = process_dataset(new)
-
-    def eval_tags(row):
-        tags = row["tags"]
-        if tags == "none" or tags == [] or tags == "{}":
-            return []
-        if tags[0] != "[":
-            tags = str([tags])
-        val = literal_eval(tags)
-        if isinstance(val, dict):
-            return []
-        return val
-
+    old_old_data["tags"] = old_old_data.apply(eval_tags, axis=1)
     old_data["tags"] = old_data.apply(eval_tags, axis=1)
     data["tags"] = data.apply(eval_tags, axis=1)
 
+    # High level count of models and rate of change
+    total_samples_old_old = old_old_data.shape[0]
     total_samples_old = old_data.shape[0]
     total_samples = data.shape[0]
-    st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)
+
+    curr_change, delta = change_and_delta(total_samples_old_old, total_samples_old, total_samples)
+
+    col1, col2 = st.columns(2)
+    with col1:
+        st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)
+
+    with col2:
+        st.metric(label="Rate of change", value=curr_change, delta=delta)
 
     # Tabs don't work in Spaces st version
     #tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
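Note: change_pct divides by the newer value, so the percentages are relative to the current count rather than the previous one. A minimal worked example of the metric above, with made-up totals of 80, 100, and 120 models across the three selected revisions (the copy of change_and_delta in utils.py below also rounds the delta, which this local copy does not):

curr_change = change_pct(100, 120)  # round(100 * (120 - 100) / 120, 3) -> 16.667
prev_change = change_pct(80, 100)   # round(100 * (100 - 80) / 100, 3)  -> 20.0
delta = f"{curr_change - prev_change}%"  # about "-3.333%"; raw float subtraction can
                                         # leave trailing noise, hence round() in utils.py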
@@ -92,20 +65,9 @@ def main():
         'Topic of interest',
         ["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
 
-    # with tab1:
     if tab == "Language":
         st.header("Languages info")
 
-        data.loc[data.languages == "False", 'languages'] = None
-        data.loc[data.languages == {}, 'languages'] = None
-        old_data.loc[old_data.languages == "False", 'languages'] = None
-        old_data.loc[old_data.languages == {}, 'languages'] = None
-
-        no_lang_count = data["languages"].isna().sum()
-        no_lang_count_old = old_data["languages"].isna().sum()
-        data["languages"] = data["languages"].fillna('none')
-        old_data["languages"] = old_data["languages"].fillna('none')
-
         def make_list(row):
             languages = row["languages"]
             if languages == "none":
@@ -113,34 +75,86 @@
             return literal_eval(languages)
 
         def language_count(row):
-            languages = row["languages"]
-            leng = len(languages)
-            return leng
+            return len(row["languages"])
 
-        data["languages"] = data.apply(make_list, axis=1)
-        data["language_count"] = data.apply(language_count, axis=1)
-        old_data["languages"] = old_data.apply(make_list, axis=1)
-        old_data["language_count"] = old_data.apply(language_count, axis=1)
+        def process_for_lang(data):
+            # Remove rows without languages
+            data.loc[data.languages == "False", 'languages'] = None
+            data.loc[data.languages == {}, 'languages'] = None
 
-        models_with_langs = data[data["language_count"] > 0]
-        langs = models_with_langs["languages"].explode()
-        langs = langs[langs != {}]
-        total_langs = len(langs.unique())
+            # Count of rows that have no languages
+            no_lang_count = data["languages"].isna().sum()
 
-        models_with_langs_old = old_data[old_data["language_count"] > 0]
-        langs_old = models_with_langs_old["languages"].explode()
-        langs_old = langs_old[langs_old != {}]
-        total_langs_old = len(langs_old.unique())
+            # As the languages column might have multiple languages,
+            # we need to convert it to a list. We then count the number of languages.
+            data["languages"] = data["languages"].fillna('none')
+            data["languages"] = data.apply(make_list, axis=1)
+            data["language_count"] = data.apply(language_count, axis=1)
+
+            # Just keep the models with at least one language
+            models_with_langs = data[data["language_count"] > 0]
+            langs = models_with_langs["languages"].explode()
+            langs = langs[langs != {}]
+            total_langs = len(langs.unique())
+
+            data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
+
+            return data, no_lang_count, total_langs, langs.unique()
+
+        filtered_data = data.copy()
+        old_filtered_data = old_data.copy()
+        old_old_filtered_data = old_old_data.copy()
+
+        modality = st.selectbox(
+            'Modalities',
+            ["All", "NLP", "Audio", "Multimodal"])
+
+        if modality == "NLP":
+            filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
+            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
+            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
+        elif modality == "Audio":
+            filtered_data = filtered_data[filtered_data["modality"] == "audio"]
+            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
+            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
+        elif modality == "Multimodal":
+            filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
+            old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
+            old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
+
+        filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
+        old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
+        old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
+
+        total_samples_filtered = filtered_data.shape[0]
+        total_samples_old_filtered = old_filtered_data.shape[0]
+        total_samples_old_old_filtered = old_old_filtered_data.shape[0]
+        v = total_samples_filtered - no_lang_count
+        v_old = total_samples_old_filtered - no_lang_count_old
+        v_old_old = total_samples_old_old_filtered - no_lang_count_old_old
 
-        col1, col2, col3 = st.columns(3)
+        col1, col2 = st.columns(2)
         with col1:
-            v = total_samples-no_lang_count
-            v_old = total_samples_old-no_lang_count_old
             st.metric(label="Language Specified", value=v, delta=int(v-v_old))
         with col2:
+            curr_change, delta = change_and_delta(v_old_old, v_old, v)
+            st.metric(label="Language Specified Rate of Change", value=curr_change, delta=delta)
+
+        col1, col2 = st.columns(2)
+        with col1:
             st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
-        with col3:
+        with col2:
+            curr_change, delta = change_and_delta(no_lang_count_old_old, no_lang_count_old, no_lang_count)
+            st.metric(label="No Language Specified Rate of Change", value=curr_change, delta=delta)
+
+        col1, col2 = st.columns(2)
+        with col1:
             st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
+        with col2:
+            curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
+            st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
+        st.text(f"New languages {set(langs)-set(langs_old)}")
 
         st.subheader("Count of languages per model repo")
         st.text("Some repos are for multiple languages, so the count is greater than 1")
@@ -148,16 +162,21 @@
             'All or just Multilingual',
             ["All", "Just Multilingual", "Three or more languages"])
 
-        filter = 0
-        st.text("Tofix: This just takes into account count of languages, it misses the multilingual tag")
-        if linguality == "Just Multilingual":
-            filter = 1
-        elif linguality == "Three or more languages":
-            filter = 2
-
-        models_with_langs = data[data["language_count"] > filter]
+        def filter_multilinguality(data):
+            if linguality == "Just Multilingual":
+                multilingual_tag = data["multilingual"] == 1
+                multiple_lang_tags = data["language_count"] > 1
+                return data[multilingual_tag | multiple_lang_tags]
+            elif linguality == "Three or more languages":
+                return data[data["language_count"] >= 3]
+            else:
+                return data
+
+        models_with_langs = filter_multilinguality(filtered_data)
+        models_with_langs_old = filter_multilinguality(old_filtered_data)
+
         df1 = models_with_langs['language_count'].value_counts()
-        models_with_langs_old = old_data[old_data["language_count"] > filter]
         df1_old = models_with_langs_old['language_count'].value_counts()
         st.bar_chart(df1)
 
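Note: this resolves the removed "Tofix" caveat: a repo tagged multilingual is now kept even when it lists only one language. A small sketch of the selection mask, with made-up rows:

import pandas as pd

rows = pd.DataFrame({"multilingual": [1, 0, 0], "language_count": [1, 2, 1]})
keep = (rows["multilingual"] == 1) | (rows["language_count"] > 1)
print(rows[keep])   # keeps row 0 (multilingual tag) and row 1 (two languages), drops row 2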
@@ -174,13 +193,13 @@
         else:
             filter = 2
 
-        models_with_langs = data[data["language_count"] > 0]
+        models_with_langs = filtered_data[filtered_data["language_count"] > 0]
         langs = models_with_langs["languages"].explode()
         langs = langs[langs != {}]
         orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
         d = orig_d
 
-        models_with_langs_old = old_data[old_data["language_count"] > 0]
+        models_with_langs_old = old_filtered_data[old_filtered_data["language_count"] > 0]
         langs = models_with_langs_old["languages"].explode()
         langs = langs[langs != {}]
         orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
@@ -212,6 +231,8 @@
         final_data = pd.merge(
             d, orig_d_old, how="outer", on="language"
         )
+        print(final_data["counts"].isna().sum())
+        print(final_data["old_c"].isna().sum())
         final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
 
         st.dataframe(final_data)
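Note: the two print calls above look like probes for a real hazard here: with how="outer", a language present in only one revision gets NaN in counts or old_c, and .astype(int) raises on NaN. A sketch of a safer version, assuming a missing count can be treated as zero:

final_data["diff"] = (
    final_data["counts"].fillna(0).astype(int)
    - final_data["old_c"].fillna(0).astype(int)
)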
utils.py ADDED
@@ -0,0 +1,69 @@
+from datasets import load_dataset
+import streamlit as st
+from ast import literal_eval
+import pandas as pd
+
+
+nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
+    "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
+]
+audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
+cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
+multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
+tabular = ["tabular-classification", "tabular-regression"]
+
+modalities = {
+    "nlp": nlp_tasks,
+    "audio": audio_tasks,
+    "cv": cv_tasks,
+    "multimodal": multimodal,
+    "tabular": tabular,
+    "rl": ["reinforcement-learning"]
+}
+
+def modality(row):
+    # Map a repo's pipeline tag to a coarse modality
+    pipeline = row["pipeline"]
+    for modality, tasks in modalities.items():
+        if pipeline in tasks:
+            return modality
+    if isinstance(pipeline, str):
+        return "unk_modality"
+    return None
+
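Note: a quick sketch of what modality returns (the task names come from the lists above; the dict rows stand in for dataframe rows):

print(modality({"pipeline": "translation"}))      # "nlp"
print(modality({"pipeline": "text-to-speech"}))   # "audio"
print(modality({"pipeline": "not-a-real-task"}))  # "unk_modality" (a string, but in no task list)
print(modality({"pipeline": None}))               # None (no pipeline metadata)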
+@st.cache(allow_output_mutation=True)
+def process_dataset(version):
+    # Load dataset at specified revision
+    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
+
+    # Convert to pandas dataframe
+    data = dataset["train"].to_pandas()
+
+    # Add modality column
+    data["modality"] = data.apply(modality, axis=1)
+
+    # Bin the model card length into some bins
+    data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
+
+    return data
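Note: st.cache memoizes process_dataset per version string, so each revision should be downloaded and converted only once per session; allow_output_mutation=True tells this (legacy) Streamlit cache not to warn when the returned frame is mutated later, as models.py does. A usage sketch with a revision name from supported_revisions:

df = process_dataset("24_10_22")         # loads and processes the dataset at that revision
df_again = process_dataset("24_10_22")   # same argument, served from the cache
print(df["modality"].value_counts())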
+
+def eval_tags(row):
+    tags = row["tags"]
+    if tags == "none" or tags == [] or tags == "{}":
+        return []
+    if tags[0] != "[":
+        tags = str([tags])
+    val = literal_eval(tags)
+    if isinstance(val, dict):
+        return []
+    return val
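Note: eval_tags turns the stringified tags column into a real list; a sketch of the cases it handles, with made-up inputs:

print(eval_tags({"tags": "none"}))                  # []
print(eval_tags({"tags": "{}"}))                    # []
print(eval_tags({"tags": "['pytorch', 'bert']"}))   # ['pytorch', 'bert']
print(eval_tags({"tags": "pytorch"}))               # ['pytorch'] (bare string is wrapped first)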
+
+def change_pct(old, new):
+    # Percentage change, relative to the newer value
+    return round(100 * (new - old) / new, 3)
+
+def change_and_delta(old_old, old, new):
+    curr_change = change_pct(old, new)
+    prev_change = change_pct(old_old, old)
+    delta = round(curr_change - prev_change, 3)
+    delta = f"{delta}%"
+    curr_change = f"{curr_change}%"
+    return curr_change, delta