speech-test committed on
Commit
743f616
•
1 Parent(s): dcde789

apply suggestions

Browse files
Files changed (1) hide show
  1. app.py +52 -33
app.py CHANGED
@@ -13,7 +13,6 @@ suggested_datasets = [
13
  "librispeech_asr",
14
  "mozilla-foundation/common_voice_8_0",
15
  "mozilla-foundation/common_voice_7_0",
16
- "common_voice",
17
  "speech-recognition-community-v2/eval_data",
18
  ]
19
 
@@ -101,38 +100,32 @@ def get_data():
101
  return pd.DataFrame.from_records(data)
102
 
103
 
104
- def suggest_datasets(datasets):
105
- ranked = set(suggested_datasets) & set(datasets)
106
- ranked = sorted(ranked, key=suggested_datasets.index)[:3]
107
- ranked = [f"* *{dataset_id}*\n" for dataset_id in ranked]
108
- if len(ranked) > 0:
109
- return f"""
110
- For general-purpose automatic speech recognition, we currently advise to pick a model that performs well on:
111
- {"".join(ranked)}
112
- """
113
- else:
114
- return ""
115
 
116
 
117
  @st.cache(ttl=600)
118
- def generate_note(lang, lang_df):
119
- lang_name = lang2name[lang] if lang in lang2name else ""
120
- num_models = len(lang_df["model_id"].unique())
121
- unique_datasets = sorted(lang_df["dataset"].unique())
122
- num_datasets = len(unique_datasets)
123
  msg = f"""
124
- For the `{lang}` ({lang_name}) language, there are currently `{num_models}` models
125
- trained on `{num_datasets}` datasets available for `automatic-speech-recognition`.
126
-
127
  The models have been trained and/or evaluated on the following datasets:
128
  """
129
- for dataset_id in unique_datasets:
130
- msg += f"* [{dataset_id}](https://hf.co/datasets/{dataset_id})\n"
 
 
 
131
  msg += """
132
  Choose the dataset that is most relevant to your task and select it from the dropdown below.
133
  """
134
- msg += suggest_datasets(unique_datasets)
135
- msg += "Please click on the model's name to be redirected to its model card which includes documentation and examples on how to use it."
136
 
137
  msg = "\n".join([line.strip() for line in msg.split("\n")])
138
  return msg
@@ -140,7 +133,6 @@ def generate_note(lang, lang_df):
140
 
141
  dataframe = get_data()
142
  dataframe = dataframe.fillna("")
143
- dataframe["model_id"] = dataframe["model_id"].apply(make_clickable)
144
 
145
  _, col_center = st.columns([3, 6])
146
  with col_center:
@@ -148,26 +140,40 @@ with col_center:
148
  st.markdown("# Speech Recognition Models Leaderboard")
149
 
150
  st.markdown(
151
- "This is a leaderboard over all speech recognition models and datasets. "
152
- "Please select a language you want to find a model for from the dropdown:"
153
  )
154
 
155
- lang = st.selectbox(
156
  "Language",
157
  sorted(dataframe["lang"].unique()),
 
158
  index=0,
159
  )
160
  lang_df = dataframe[dataframe.lang == lang]
161
 
162
- msg = generate_note(lang, lang_df)
163
- st.markdown(msg)
164
 
165
- dataset = st.selectbox(
 
 
 
 
 
 
 
 
 
 
 
 
166
  "Dataset",
167
- sorted(lang_df["dataset"].unique()),
168
  index=0,
169
  )
170
  dataset_df = lang_df[lang_df.dataset == dataset]
 
 
171
  if lang in cer_langs:
172
  dataset_df = dataset_df[["model_id", "cer"]]
173
  dataset_df.sort_values("cer", inplace=True)
@@ -183,7 +189,20 @@ dataset_df.rename(
183
  inplace=True,
184
  )
185
 
186
- st.write(dataset_df.to_html(escape=False, index=None), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  if lang in cer_langs:
189
  st.markdown(
 
13
  "librispeech_asr",
14
  "mozilla-foundation/common_voice_8_0",
15
  "mozilla-foundation/common_voice_7_0",
 
16
  "speech-recognition-community-v2/eval_data",
17
  ]
18
 
 
100
  return pd.DataFrame.from_records(data)
101
 
102
 
103
+ def sort_datasets(datasets):
104
+ # 1. sort by name
105
+ datasets = sorted(datasets)
106
+ # 2. bring the suggested datasets to the top and append the rest
107
+ datasets = sorted(
108
+ datasets,
109
+ key=lambda dataset_id: suggested_datasets.index(dataset_id)
110
+ if dataset_id in suggested_datasets
111
+ else len(suggested_datasets),
112
+ )
113
+ return datasets
114
 
115
 
116
  @st.cache(ttl=600)
117
+ def generate_dataset_info(datasets):
 
 
 
 
118
  msg = f"""
 
 
 
119
  The models have been trained and/or evaluated on the following datasets:
120
  """
121
+ for dataset_id in datasets:
122
+ if dataset_id in suggested_datasets:
123
+ msg += f"* [{dataset_id}](https://hf.co/datasets/{dataset_id}) *(recommended)*\n"
124
+ else:
125
+ msg += f"* [{dataset_id}](https://hf.co/datasets/{dataset_id})\n"
126
  msg += """
127
  Choose the dataset that is most relevant to your task and select it from the dropdown below.
128
  """
 
 
129
 
130
  msg = "\n".join([line.strip() for line in msg.split("\n")])
131
  return msg
 
133
 
134
  dataframe = get_data()
135
  dataframe = dataframe.fillna("")
 
136
 
137
  _, col_center = st.columns([3, 6])
138
  with col_center:
 
140
  st.markdown("# Speech Recognition Models Leaderboard")
141
 
142
  st.markdown(
143
+ "This is a leaderboard over all speech recognition models and datasets.\n\n"
144
+ "⬅ Please select a language you want to find a model for from the dropdown on the left."
145
  )
146
 
147
+ lang = st.sidebar.selectbox(
148
  "Language",
149
  sorted(dataframe["lang"].unique()),
150
+ format_func=lambda key: lang2name.get(key, key),
151
  index=0,
152
  )
153
  lang_df = dataframe[dataframe.lang == lang]
154
 
155
+ sorted_datasets = sort_datasets(lang_df["dataset"].unique())
 
156
 
157
+ text = generate_dataset_info(sorted_datasets)
158
+ st.sidebar.markdown(text)
159
+
160
+ lang_name = lang2name[lang] if lang in lang2name else ""
161
+ num_models = len(lang_df["model_id"].unique())
162
+ num_datasets = len(lang_df["dataset"].unique())
163
+ text = f"""
164
+ For the `{lang}` ({lang_name}) language, there are currently `{num_models}` model(s)
165
+ trained on `{num_datasets}` dataset(s) available for `automatic-speech-recognition`.
166
+ """
167
+ st.markdown(text)
168
+
169
+ dataset = st.sidebar.selectbox(
170
  "Dataset",
171
+ sorted_datasets,
172
  index=0,
173
  )
174
  dataset_df = lang_df[lang_df.dataset == dataset]
175
+
176
+ # sort by WER or CER depending on the language
177
  if lang in cer_langs:
178
  dataset_df = dataset_df[["model_id", "cer"]]
179
  dataset_df.sort_values("cer", inplace=True)
 
189
  inplace=True,
190
  )
191
 
192
+ st.markdown(
193
+ "Please click on the model's name to be redirected to its model card which includes documentation and examples on how to use it."
194
+ )
195
+
196
+ # display the model ranks
197
+ dataset_df = dataset_df.reset_index(drop=True)
198
+ dataset_df.index += 1
199
+
200
+ # turn the model ids into clickable links
201
+ dataset_df["model_id"] = dataset_df["model_id"].apply(make_clickable)
202
+
203
+ table_html = dataset_df.to_html(escape=False)
204
+ table_html = table_html.replace("<th>", '<th align="left">') # left-align the headers
205
+ st.write(table_html, unsafe_allow_html=True)
206
 
207
  if lang in cer_langs:
208
  st.markdown(