speech-test committed on
Commit
582e085
•
1 Parent(s): 47e279a

Metrics editor

Browse files
Files changed (2) hide show
  1. app.py +118 -10
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,6 +6,7 @@ import requests
6
  import streamlit as st
7
  import yaml
8
  from huggingface_hub import hf_hub_download
 
9
  from streamlit_tags import st_tags
10
 
11
  # exact same regex as in the Hub server. Please keep in sync.
@@ -93,7 +94,7 @@ def main():
93
  "[our list here](https://huggingface.co/spaces/huggingface/hf-speech-bench/blob/main/languages.json). "
94
  "When in doubt, use the most generic language code, e.g. `en` instead of `en-GB` and `en-US`."
95
  )
96
- st.markdown("*Example*: `cs, hsb, pl`")
97
  metadata["language"] = metadata["language"] if "language" in metadata else []
98
  metadata["language"] = (
99
  metadata["language"]
@@ -107,13 +108,19 @@ def main():
107
  lang_names = [lang2name[lang] if lang in lang2name else lang for lang in languages]
108
  st.markdown("These languages will be parsed by the leaderboard as: ")
109
  st.code(", ".join(lang_names))
 
110
 
111
  ############################
112
  # TRAIN DATASETS
113
  ############################
114
  st.markdown("### Training dataset(s)")
115
- st.markdown("List the datasets that your model was trained on.")
116
- st.markdown("*Example*: `librispeech_asr, mozilla-foundation/common_voice_8_0`")
 
 
 
 
 
117
 
118
  if "datasets" not in metadata:
119
  metadata["datasets"] = []
@@ -126,6 +133,7 @@ def main():
126
  "WARNING: `common_voice` is deprecated, please replace it with its equivalent: "
127
  "`mozilla-foundation/common_voice_6_1`"
128
  )
 
129
 
130
  ############################
131
  # MODEL NAME
@@ -134,16 +142,116 @@ def main():
134
  st.markdown("Enter a descriptive name for your model.")
135
  st.markdown("*Example*: `XLS-R Wav2Vec2 LM Spanish by Jane Doe`")
136
 
137
- if "model_index" not in metadata:
138
- metadata["model_index"] = [{}]
139
- if "name" not in ["model_index"][0]:
140
- metadata["model_index"][0]["name"] = model_id.split("/")[-1]
141
- model_name = st.text_input("", value=metadata["model_index"][0]["name"])
 
142
 
143
  ############################
144
- # EVAL DATASETS
145
  ############################
146
- st.markdown("### Evaluation metrics")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
 
149
  if __name__ == "__main__":
 
6
  import streamlit as st
7
  import yaml
8
  from huggingface_hub import hf_hub_download
9
+ from streamlit_ace import st_ace
10
  from streamlit_tags import st_tags
11
 
12
  # exact same regex as in the Hub server. Please keep in sync.
 
94
  "[our list here](https://huggingface.co/spaces/huggingface/hf-speech-bench/blob/main/languages.json). "
95
  "When in doubt, use the most generic language code, e.g. `en` instead of `en-GB` and `en-US`."
96
  )
97
+ st.markdown("*Example*: `en, gsw, pt-BR`")
98
  metadata["language"] = metadata["language"] if "language" in metadata else []
99
  metadata["language"] = (
100
  metadata["language"]
 
108
  lang_names = [lang2name[lang] if lang in lang2name else lang for lang in languages]
109
  st.markdown("These languages will be parsed by the leaderboard as: ")
110
  st.code(", ".join(lang_names))
111
+ metadata["language"] = languages
112
 
113
  ############################
114
  # TRAIN DATASETS
115
  ############################
116
  st.markdown("### Training dataset(s)")
117
+ st.markdown(
118
+ "List the datasets that your model was **trained** on. "
119
+ "If the datasets aren't published on the Hub yet, just add their names anyway."
120
+ )
121
+ st.markdown(
122
+ "*Example*: `librispeech_asr, mozilla-foundation/common_voice_8_0, my_custom_youtube_dataset`"
123
+ )
124
 
125
  if "datasets" not in metadata:
126
  metadata["datasets"] = []
 
133
  "WARNING: `common_voice` is deprecated, please replace it with its equivalent: "
134
  "`mozilla-foundation/common_voice_6_1`"
135
  )
136
+ metadata["datasets"] = train_datasets
137
 
138
  ############################
139
  # MODEL NAME
 
142
  st.markdown("Enter a descriptive name for your model.")
143
  st.markdown("*Example*: `XLS-R Wav2Vec2 LM Spanish by Jane Doe`")
144
 
145
+ if "model-index" not in metadata:
146
+ metadata["model-index"] = [{}]
147
+ if "name" not in ["model-index"][0]:
148
+ metadata["model-index"][0]["name"] = model_id.split("/")[-1]
149
+ model_name = st.text_input("", value=metadata["model-index"][0]["name"])
150
+ metadata["model-index"][0]["name"] = model_name
151
 
152
  ############################
153
+ # EVAL RESULTS
154
  ############################
155
+ st.markdown("### Evaluation results")
156
+ st.markdown("To edit the metrics, you can either use the YAML editor below, or add new metrics using the handy "
157
+ "form under it.")
158
+ if "results" not in metadata["model-index"][0]:
159
+ metadata["model-index"][0]["results"] = []
160
+
161
+ results_editor = st.empty()
162
+ with results_editor:
163
+ results_yaml = yaml.dump(
164
+ metadata["model-index"][0]["results"], sort_keys=False, line_break="\n"
165
+ )
166
+ results_yaml = st_ace(value=results_yaml, language="yaml")
167
+ metadata["model-index"][0]["results"] = try_parse_yaml(results_yaml)
168
+
169
+ with st.form(key="eval_form"):
170
+ dataset_name = st.text_input(
171
+ label="Full name of the dataset", placeholder="Common Voice 8.0"
172
+ )
173
+ dataset_path = st.text_input(
174
+ label="Dataset path / id", placeholder="mozilla-foundation/common_voice_8_0"
175
+ )
176
+ dataset_config = st.text_input(
177
+ label="Dataset config (language). Examples: en, pt-BR, clean",
178
+ placeholder="en",
179
+ )
180
+ metric_name = st.text_input(label="Metric name", placeholder="Test WER (+LM)")
181
+ metric2name = {"wer": "Word Error Rate", "cer": "Character Error Rate"}
182
+ metric_type = st.selectbox(
183
+ label="Metric",
184
+ options=["wer", "cer"],
185
+ format_func=lambda key: metric2name[key],
186
+ )
187
+ metric_value = st.text_input(
188
+ label="Metric value (0.0 - 100.0)",
189
+ placeholder="12.34",
190
+ )
191
+ try:
192
+ metric_value = float(metric_value)
193
+ except ValueError:
194
+ st.error(f"Couldn't parse `{metric_value}`. Make sure it's a number from 0.0 to 100.0")
195
+
196
+ submitted = st.form_submit_button("Submit")
197
+ if submitted:
198
+ metric = {
199
+ "name": metric_name,
200
+ "type": metric_type,
201
+ "value": metric_value,
202
+ }
203
+ # first, try to find an existing dataset+config record to add a new metric to it
204
+ updated_existing = False
205
+ for existing_result in metadata["model-index"][0]["results"]:
206
+ existing_dataset = existing_result["dataset"]
207
+ if (
208
+ existing_dataset["type"] == dataset_path
209
+ and existing_dataset["args"] == dataset_config
210
+ ):
211
+ if "metrics" not in existing_result:
212
+ existing_result["metrics"] = []
213
+ existing_result["metrics"].append(metric)
214
+ updated_existing = True
215
+ break
216
+ # if no dataset+config results found, create a new one
217
+ if not updated_existing:
218
+ result = {
219
+ "task": {
220
+ "name": "Automatic Speech Recognition",
221
+ "type": "automatic-speech-recognition",
222
+ },
223
+ "dataset": {
224
+ "name": dataset_name,
225
+ "type": dataset_path,
226
+ "args": dataset_config,
227
+ },
228
+ "metrics": [metric],
229
+ }
230
+ metadata["model-index"][0]["results"].append(result)
231
+
232
+ # update the code editor
233
+ with results_editor:
234
+ results_yaml = yaml.dump(
235
+ metadata["model-index"][0]["results"],
236
+ sort_keys=False,
237
+ line_break="\n",
238
+ )
239
+ results_yaml = st_ace(value=results_yaml, language="yaml")
240
+ metadata["model-index"][0]["results"] = try_parse_yaml(results_yaml)
241
+ st.success(f"Added the metric for {dataset_path} - {dataset_config}! "
242
+ f"Check the result in the YAML editor above.")
243
+
244
+ ############################
245
+ # FINAL YAML
246
+ ############################
247
+ st.markdown("## 3. Copy the generated metadata")
248
+ st.markdown(
249
+ "Copy the YAML from below and replace the metadata at the top of your model's README.md here: "
250
+ f"https://huggingface.co/{model_id}/blob/main/README.md"
251
+ )
252
+
253
+ new_yaml = yaml.dump(metadata, sort_keys=False, line_break="\n")
254
+ st.markdown(f"```yaml\n---\n{new_yaml}---\n```")
255
 
256
 
257
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- streamlit-tags
 
 
1
+ streamlit-tags
2
+ streamlit-ace