import streamlit as st
from pandas import DataFrame, concat, read_csv
import os
from evaluate import load
from huggingface_hub import Repository
import zipfile

# first define URLs for the reference and submission datasets on the Hub
REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"
REFERENCE_URL = os.path.join("https://huggingface.co/datasets/xtreme-s", REFERENCE_NAME)
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/xtreme-s", SUBMISSION_NAME)

# clone these repos using the token provided
HF_TOKEN = os.environ.get("HF_TOKEN")
reference_repo = Repository(local_dir="references", clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN)
submission_repo = Repository(local_dir="submissions", clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
submission_repo.git_pull()

all_submissions = [
    folder
    for folder in os.listdir(SUBMISSION_NAME)
    if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]

# define the XTREME-S test sets
TEST_SETS = [
    "fleurs",
    "mls",
    "vp",
    "covost-2",
    "f-lid",
    "m-14",
]
EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]

# define the optional test sets - ignore for now
OPTIONAL_TEST_SETS = []  # ["f-r5"]
OPTIONAL_TEST_FILES = [f + ".txt" for f in OPTIONAL_TEST_SETS]

# load all metrics
wer_metric = load("wer")
bleu_metric = load("bleu")
acc_metric = load("accuracy")
f1_metric = load("f1")

# map each test set to its metric
METRIC_MAP = {
    "fleurs": wer_metric,
    "mls": wer_metric,
    "vp": wer_metric,
    "covost-2": bleu_metric,
    "f-lid": acc_metric,
    "m-14": f1_metric,
}


def compute_score(pred_file, ref_file, metric):
    """Assess the predicted file against the reference file for a given metric."""
    with open(pred_file, "r", encoding="utf-8") as pred, open(ref_file, "r", encoding="utf-8") as ref:
        # TODO: any post-processing required?
        pred_lines = [line.strip() for line in pred.readlines()]
        ref_lines = [line.strip() for line in ref.readlines()]
    # `evaluate` metrics are computed through `.compute` with keyword arguments
    score = metric.compute(predictions=pred_lines, references=ref_lines)
    # bleu/accuracy/f1 return a dict of results - take the leading (headline) value; wer returns a float
    if isinstance(score, dict):
        score = list(score.values())[0]
    return score


# load up the results file
CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")
all_results = read_csv(CSV_RESULTS_FILE)

# build the display table from the CSV
table = all_results.copy()

# make sure the column ordering is correct (name, fleurs, mls, ..., average-score)
average_column = table.pop("average-score")
name_column = table.pop("name")
num_columns = len(table.columns)
table.insert(num_columns, "average-score", average_column)
table = table.select_dtypes(exclude=["object", "string"])
table.insert(0, "name", name_column)

# sort by average score
table = table.sort_values(by=["average-score"], ascending=False, ignore_index=True)
table = table.round(2)

# start ranking from index 1
table.index = table.index + 1

# Streamlit page
st.markdown("# XTREME-S: Evaluating Cross-lingual Speech Representations")
st.markdown(
    "This is the leaderboard for the XTREME-S benchmark. Submitted systems are ranked by the **average score**, which"
    " is a weighted average of the mandatory test sets:"
)
# hacky way of getting math-mode to render
st.write(
    r"""
$$
\begin{gathered}
0.4 * \left(100 - \frac{\text{Fleurs} + \text{MLS} + \text{VP}}{3}\right)_{(\mathrm{WER})} + \\
0.4 * \text{CoVoST-2}_{(\mathrm{BLEU})} + 0.2 * \left(\frac{\text{F-LID} + \text{M-14}}{2}\right)_{(\mathrm{Acc})}
\end{gathered}
$$
"""
)
st.markdown("The optional f-r5 test set does not contribute to the average score.")

# st.table(table)
st.dataframe(table.style.format(subset=["average-score", *TEST_SETS, *OPTIONAL_TEST_SETS], formatter="{:.1f}"))
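

# Illustrative helper (not used elsewhere in this app): the ranking formula rendered above, expressed in
# code. `scores` is assumed to be a dict keyed by test set name with values on a 0-100 scale; the same
# expression is computed inline further below when a new submission is scored.
def example_average_score(scores):
    return (
        0.4 * (100 - (scores["fleurs"] + scores["mls"] + scores["vp"]) / 3)
        + 0.4 * scores["covost-2"]
        + 0.2 * (scores["f-lid"] + scores["m-14"]) / 2
    )
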
st.markdown(
    """
XTREME-S was proposed in *XTREME-S: Evaluating Cross-lingual Speech Representations*, by Conneau et al. \n
The abstract of the paper is as follows: \n
*We introduce XTREME-S, a new benchmark to evaluate universal cross-lingual speech representations in many languages.
XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
Covering 102 languages from 10+ language families, 3 different domains and 4 task families, XTREME-S aims to simplify
multilingual speech representation evaluation, as well as catalyze research in "universal" speech representation
learning. This paper describes the new benchmark and establishes the first speech-only and speech-text baselines using
XLS-R and mSLAM on all downstream tasks. We motivate the design choices and detail how to use the benchmark.* \n
For more information, refer to the paper submission on [Arxiv](https://arxiv.org/abs/2203.10752).
"""
)

st.markdown(
    """
## Submitting to XTREME-S \n
To submit to XTREME-S, download the audio data for the mandatory XTREME-S test sets from
[xtreme-s/datasets](https://huggingface.co/datasets/xtreme-s/datasets). The test sets contain audio data only.
Evaluate your system on the six test sets by generating predictions for the unlabelled audio samples. For each test
set, save the predictions to a .txt file in the order that the audio samples are provided, with one prediction per
line. Name each .txt file according to the XTREME-S test set names shown in the table (e.g. the predictions for FLEURS
should be named fleurs.txt). \n
Once you have evaluated your system on all six mandatory test sets, move the predictions into one folder and zip it.
The name you assign to the zipped folder will be the name displayed on the leaderboard (e.g. mSLAM.zip will be
displayed as mSLAM). Upload your zipped submission for scoring and placement on the leaderboard. \n
Should you experience any issues, open a [new discussion](https://huggingface.co/spaces/xtreme-s/leaderboard/discussions/new)
and tag `@sanchit-gandhi`.
"""
)

# submission form
with st.form(key="my_form"):
    uploaded_file = st.file_uploader("Choose a zip file")
    submit_button = st.form_submit_button(label="Submit")

if submit_button:
    if uploaded_file is None:
        raise ValueError("Please make sure you have uploaded a zip file.")

    submission = uploaded_file.name.split(".zip")[0]
    with st.spinner(f"Uploading {submission}..."):
        with zipfile.ZipFile(uploaded_file, "r") as zip_ref:
            zip_ref.extractall(submission_repo.local_dir)
        submission_repo.push_to_hub()

    with st.spinner(f"Computing XTREME-S Score for {submission}..."):
        results = {"name": submission}

        all_submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))
        submitted_files = [f for f in all_submitted_files if f in EXPECTED_TEST_FILES]
        submitted_optional_files = [f for f in all_submitted_files if f in OPTIONAL_TEST_FILES]

        if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
            raise ValueError(
                f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
            )

        for file in submitted_files:
            ref_file = os.path.join(REFERENCE_NAME, file)
            pred_file = os.path.join(SUBMISSION_NAME, submission, file)

            test_set = file.split(".")[0]
            metric = METRIC_MAP[test_set]

            score = compute_score(pred_file, ref_file, metric)
            results[test_set] = round(100 * score, 2)

        # TODO: assessment of the 'optional' test sets

        # the XTREME-S score is computed over the mandatory test sets only
        average_score = (
            0.4 * (100 - (results["fleurs"] + results["mls"] + results["vp"]) / 3)
            + 0.4 * results["covost-2"]
            + 0.2 * (results["f-lid"] + results["m-14"]) / 2
        )
        results["average-score"] = round(average_score, 2)

        # DataFrame.append was removed in pandas 2.x - use concat to add the new row instead
        all_results = concat([all_results, DataFrame([results])], ignore_index=True)

    # save and upload the newly evaluated results
    all_results.to_csv(CSV_RESULTS_FILE, index=False)
    commit_url = submission_repo.push_to_hub()

    st.success("Please refresh this space (CTRL+R) to see your result")