import json
import re
from pathlib import Path

import requests
import streamlit as st
import yaml
from huggingface_hub import hf_hub_download
from streamlit_ace import st_ace
from streamlit_tags import st_tags

# exact same regex as in the Hub server. Please keep in sync.
REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")

with open("languages.json") as f:
    lang2name = json.load(f)
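

# Parse a YAML block, surfacing any parse errors in the UI via st.error().
# Returns the parsed dict, or None if the YAML is invalid.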
def try_parse_yaml(yaml_block):
    try:
        metadata = yaml.load(yaml_block, yaml.SafeLoader)
    except yaml.YAMLError as e:
        print("Error while parsing the metadata YAML:")
        if hasattr(e, "problem_mark"):
            if e.context is not None:
                st.error(
                    str(e.problem_mark)
                    + "\n "
                    + str(e.problem)
                    + " "
                    + str(e.context)
                    + "\nPlease correct the README.md and retry."
                )
            else:
                st.error(
                    str(e.problem_mark)
                    + "\n "
                    + str(e.problem)
                    + "\nPlease correct the README.md and retry."
                )
        else:
            st.error(
                "Something went wrong while parsing the metadata. "
                "Make sure it's written according to the YAML spec!"
            )
        return None
    return metadata
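

# Streamlit entry point: walks the user through loading the model card,
# editing the metadata, and copying the regenerated YAML back to the Hub.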
def main():
    st.markdown("# The 🤗 Speech Bench Metrics Editor")
    st.markdown(
        "This tool will help you report the evaluation metrics for all of your speech recognition models. "
        "Follow the steps and watch your models appear on the [Speech Bench Leaderboard](https://huggingface.co/spaces/huggingface/hf-speech-bench)!"
    )

    st.markdown("## 1. Load your model's metadata")
    st.markdown("Enter your model's path below.")
    model_id = st.text_input("", placeholder="<username>/<model>")
    if not model_id.strip():
        st.stop()
    try:
        readme_path = hf_hub_download(model_id, filename="README.md")
    except requests.exceptions.HTTPError:
        st.error(
            f"ERROR: https://huggingface.co/{model_id}/blob/main/README.md "
            "not found, make sure you've entered a correct model path and created a model card for it!"
        )
        st.stop()
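
    # Extract the YAML metadata block enclosed in `---` fences at the top of the README.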
    content = Path(readme_path).read_text()
    match = REGEX_YAML_BLOCK.search(content)
    if match:
        meta_yaml = match.group(1)
    else:
        st.error(
            "ERROR: Couldn't find the metadata section inside your model's `README.md`. Do you have some basic metadata "
            "enclosed in `---` as described in [the Hub documentation](https://huggingface.co/docs/hub/model-repos#model-card-metadata)?"
        )
        st.stop()

    metadata = try_parse_yaml(meta_yaml)
    if metadata is None:
        st.stop()
    else:
        st.success("Successfully loaded the metadata!")
        with st.expander("Inspect the parsed metadata for debugging"):
            st.json(metadata)
st.markdown("## 2. Edit the data")
############################
# LANGUAGES
############################
st.markdown("### Language(s)")
st.markdown(
"For each spoken language that your model handles, enter an "
"[ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) language code, or "
"find an appropriate alternative from "
"[our list here](https://huggingface.co/spaces/huggingface/hf-speech-bench/blob/main/languages.json). "
"When in doubt, use the most generic language code, e.g. `en` instead of `en-GB` and `en-US`."
)
st.markdown("*Example*: `en, gsw, pt-BR`")
metadata["language"] = metadata["language"] if "language" in metadata else []
metadata["language"] = (
metadata["language"]
if isinstance(metadata["language"], list)
else [metadata["language"]]
)
languages = st_tags(
label="", text="add more if needed, and press enter", value=metadata["language"],
key=model_id+"_langs"
)
lang_names = [lang2name[lang] if lang in lang2name else lang for lang in languages]
st.markdown("These languages will be parsed by the leaderboard as: ")
st.code(", ".join(lang_names))
metadata["language"] = languages

    ############################
    # TRAIN DATASETS
    ############################
    st.markdown("### Training dataset(s)")
    st.markdown(
        "List the datasets that your model was **trained** on. "
        "If the datasets aren't published on the Hub yet, just add their names anyway."
    )
    st.markdown(
        "*Example*: `librispeech_asr, mozilla-foundation/common_voice_8_0, my_custom_youtube_dataset`"
    )
    if "datasets" not in metadata:
        metadata["datasets"] = []
    train_datasets = st_tags(
        label="",
        text="add more if needed, and press enter",
        value=metadata["datasets"],
        key=model_id + "_train_dataset",
    )
    if "common_voice" in train_datasets:
        st.warning(
            "WARNING: `common_voice` is deprecated, please replace it with its equivalent: "
            "`mozilla-foundation/common_voice_6_1`"
        )
    metadata["datasets"] = train_datasets

    ############################
    # MODEL NAME
    ############################
    st.markdown("### Model name")
    st.markdown("Enter a pretty name for your model.")
    st.markdown("*Example*: `XLS-R Wav2Vec2 LM Spanish by Jane Doe`")
    if "model-index" not in metadata:
        metadata["model-index"] = [{}]
    if "name" not in metadata["model-index"][0]:
        metadata["model-index"][0]["name"] = model_id.split("/")[-1]
    model_name = st.text_input("", value=metadata["model-index"][0]["name"])
    metadata["model-index"][0]["name"] = model_name

    ############################
    # EVAL RESULTS
    ############################
    st.markdown("### Evaluation results")
    st.markdown(
        "To edit the metrics, you can either use the YAML editor below, or add new metrics using the handy "
        "form under it."
    )
    if "results" not in metadata["model-index"][0]:
        metadata["model-index"][0]["results"] = []

    results_editor = st.empty()
    with results_editor:
        results_yaml = yaml.dump(
            metadata["model-index"][0]["results"], sort_keys=False, line_break="\n"
        )
        results_yaml = st_ace(value=results_yaml, language="yaml")
    metadata["model-index"][0]["results"] = try_parse_yaml(results_yaml)
    # guard against invalid YAML typed into the editor
    if metadata["model-index"][0]["results"] is None:
        st.stop()
    dataset_path_kwargs = {}
    dataset_name_kwargs = {}
    if (
        len(metadata["model-index"][0]["results"]) > 0
        and "dataset" in metadata["model-index"][0]["results"][0]
    ):
        if "type" in metadata["model-index"][0]["results"][0]["dataset"]:
            dataset_path_kwargs["value"] = metadata["model-index"][0]["results"][0][
                "dataset"
            ]["type"]
        if "name" in metadata["model-index"][0]["results"][0]["dataset"]:
            dataset_name_kwargs["value"] = metadata["model-index"][0]["results"][0][
                "dataset"
            ]["name"]
    with st.form(key="eval_form"):
        dataset_path = st.text_input(
            label="Dataset path / id",
            placeholder="mozilla-foundation/common_voice_8_0",
            **dataset_path_kwargs,
        )
        dataset_name = st.text_input(
            label="A pretty name for the dataset. Examples: 'Common Voice 9.0 (French)', 'LibriSpeech (clean)'",
            placeholder="Common Voice 8.0 (French)",
            **dataset_name_kwargs,
        )
        dataset_config = st.text_input(
            label="Dataset configuration. Examples: clean, other, en, pt-BR",
            placeholder="en",
        )
        dataset_language_kwargs = {"value": languages[0]} if len(languages) > 0 else {}
        dataset_language = st.text_input(
            label="Dataset language. Examples: en, pt-BR",
            placeholder="en",
            **dataset_language_kwargs,
        )
        dataset_split = st.text_input(
            label="Dataset split. Examples: test, validation",
            value="test",
            placeholder="test",
        )
        metric2name = {"wer": "Word Error Rate", "cer": "Character Error Rate"}
        metric_type = st.selectbox(
            label="Metric",
            options=["wer", "cer"],
            format_func=lambda key: metric2name[key],
        )
        metric_name = st.text_input(
            label="A pretty name for the metric. Example: Test WER (+LM)",
            placeholder="Test WER",
            value="Test WER",
        )
        metric_value = st.text_input(
            label="Metric value. Use values in range 0.0 to 100.0.",
            placeholder="12.34",
        )
        # try:
        #     metric_value = float(metric_value)
        # except ValueError:
        #     st.error(
        #         f"Couldn't parse `{metric_value}`. Make sure it's a number from 0.0 to 100.0"
        #     )
        submitted = st.form_submit_button("Add metric")
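
    # Apply the new metric only when the form is submitted with every field filled in.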
    if (
        submitted
        and dataset_name
        and dataset_path
        and dataset_config
        and dataset_split
        and dataset_language
        and metric_name
        and metric_type
        and metric_value
    ):
        metric = {
            "name": metric_name,
            "type": metric_type,
            "value": metric_value,
        }
        # first, try to find an existing dataset+config record to append the new metric to
        updated_existing = False
        for existing_result in metadata["model-index"][0]["results"]:
            existing_dataset = existing_result["dataset"]
            if (
                existing_dataset["type"] == dataset_path
                and "config" in existing_dataset
                and existing_dataset["config"] == dataset_config
                and "split" in existing_dataset
                and existing_dataset["split"] == dataset_split
            ):
                if "metrics" not in existing_result:
                    existing_result["metrics"] = []
                existing_result["metrics"].append(metric)
                updated_existing = True
                break
        # if no dataset+config results found, create a new one
        if not updated_existing:
            result = {
                "task": {
                    "name": "Automatic Speech Recognition",
                    "type": "automatic-speech-recognition",
                },
                "dataset": {
                    "name": dataset_name,
                    "type": dataset_path,
                    "config": dataset_config,
                    "split": dataset_split,
                    "args": {"language": dataset_language},
                },
                "metrics": [metric],
            }
            metadata["model-index"][0]["results"].append(result)

        # update the code editor
        with results_editor:
            results_yaml = yaml.dump(
                metadata["model-index"][0]["results"],
                sort_keys=False,
                line_break="\n",
            )
            results_yaml = st_ace(value=results_yaml, language="yaml")
        metadata["model-index"][0]["results"] = try_parse_yaml(results_yaml)

        st.success(
            f"Added the metric for {dataset_path} - {dataset_config}! "
            f"Check the result in the YAML editor above."
        )
    elif submitted:
        st.error(
            "Make sure that you've filled the whole form before clicking 'Add metric'!"
        )

    ############################
    # FINAL YAML
    ############################
    st.markdown("## 3. Copy the generated metadata")
    st.markdown(
        "Copy the YAML from below and replace the metadata at the top of your model's README.md here: "
        f"https://huggingface.co/{model_id}/edit/main/README.md"
    )
    st.markdown(
        "For more info on the metadata schema, please refer to "
        "https://raw.githubusercontent.com/huggingface/hub-docs/main/modelcard.md"
    )
    new_yaml = yaml.dump(metadata, sort_keys=False, line_break="\n")
    st.markdown(f"```yaml\n---\n{new_yaml}---\n```")


if __name__ == "__main__":
    main()