import streamlit as st
from pandas import DataFrame, concat, read_csv
import os
from evaluate import load
from huggingface_hub import Repository
import zipfile

# first define URLs for the reference and submission datasets on the Hub
REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"

REFERENCE_URL = os.path.join("https://huggingface.co/datasets/xtreme-s", REFERENCE_NAME)
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/xtreme-s", SUBMISSION_NAME)

# grab these repos using the token provided
HF_TOKEN = os.environ.get("HF_TOKEN")

reference_repo = Repository(local_dir="references", clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN)
submission_repo = Repository(local_dir="submissions", clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
submission_repo.git_pull()

all_submissions = [
    folder
    for folder in os.listdir(SUBMISSION_NAME)
    if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]

# define the XTREME-S test sets
TEST_SETS = [
    "fleurs",
    "mls",
    "vp",
    "covost-2",
    "f-lid",
    "m-14",
]

EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]

# define the optional test sets - ignore for now
OPTIONAL_TEST_SETS = []  # ["f-r5"]
OPTIONAL_TEST_FILES = [f + ".txt" for f in OPTIONAL_TEST_SETS]

# load all metrics
wer_metric = load("wer")
bleu_metric = load("bleu")
acc_metric = load("accuracy")
f1_metric = load("f1")

# map each test set to its metric: FLEURS, MLS and VoxPopuli (vp) are ASR tasks scored with WER,
# CoVoST-2 is speech translation scored with BLEU, F-LID is language identification scored with
# accuracy, and M-14 (MINDS-14) is intent classification scored with F1
METRIC_MAP = {
    "fleurs": wer_metric,
    "mls": wer_metric,
    "vp": wer_metric,
    "covost-2": bleu_metric,
    "f-lid": acc_metric,
    "m-14": f1_metric,
}


def compute_score(pred_file, ref_file, metric):
    """Score the predictions in pred_file against the references in ref_file with the given metric."""
    with open(pred_file, "r", encoding="utf-8") as pred, open(ref_file, "r", encoding="utf-8") as ref:
        # TODO: any post-processing required?
        pred_lines = [line.strip() for line in pred.readlines()]
        ref_lines = [line.strip() for line in ref.readlines()]

    # `evaluate` modules are invoked via `.compute`: WER returns a bare float, whereas
    # BLEU, accuracy and F1 return a dict whose first entry is the headline score
    score = metric.compute(predictions=pred_lines, references=ref_lines)
    if isinstance(score, dict):
        score = next(iter(score.values()))
    return score


# load up the results file
CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")

all_results = read_csv(CSV_RESULTS_FILE)

# Build the display table from the results CSV
table = all_results.copy()

# make sure the column ordering is correct (name, fleurs, mls, ..., average-score)
average_column = table.pop("average-score")
name_column = table.pop("name")
num_columns = len(table.columns)
table.insert(num_columns, "average-score", average_column)
table = table.select_dtypes(exclude=["object", "string"])
table.insert(0, "name", name_column)
# sort by average score
table = table.sort_values(by=["average-score"], ascending=False, ignore_index=True)
table = table.round(2)
# start ranking from index 1
table.index = table.index + 1

# Streamlit
st.markdown("# XTREME-S: Evaluating Cross-lingual Speech Representations")

st.markdown(
    "This is the leaderboard for the XTREME-S benchmark. Submitted systems are ranked by the **average score**, which"
    " is a weighted average of the mandatory test sets:"
)
# hacky way of getting math-mode to render
st.write(
    r"""
    $$
    \begin{gathered}
    0.4 *\left(100-\frac{\text{Fleurs}+\text{MLS}+\text{VP}}{3}\right)_{(\mathrm{WER})}+ \\
    0.4 * \text{CoVoST-2}_{(\mathrm{BLEU})}+0.2 *\left(\frac{\text{F-LID}+\text{M-14}}{2}\right)_{(\mathrm{Acc})}
    \end{gathered}
    $$
    """
)
st.markdown("The optional dataset of f-r5 does not contribute to the average score.")

# st.table(table)
st.dataframe(table.style.format(subset=["average-score", *TEST_SETS, *OPTIONAL_TEST_SETS], formatter="{:.1f}"))

st.markdown(
    """
    XTREME-S was proposed in *XTREME-S: Evaluating Cross-lingual Speech Representations* by Conneau et al.
    \n
    The abstract of the paper is as follows:
    \n
    *We introduce XTREME-S, a new benchmark to evaluate universal cross-lingual speech representations in many languages. XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval. Covering 102 languages from 10+ language families, 3 different domains and 4 task families, XTREME-S aims to simplify multilingual speech representation evaluation, as well as catalyze research in "universal" speech representation learning. This paper describes the new benchmark and establishes the first speech-only and speech-text baselines using XLS-R and mSLAM on all downstream tasks. We motivate the design choices and detail how to use the benchmark.*
    \n
    For more information, refer to the paper on [arXiv](https://arxiv.org/abs/2203.10752).
    """
)

st.markdown(
    """
    ## Submitting to XTREME-S
    \n
    To submit to XTREME-S, download the audio data for the mandatory XTREME-S test sets from [xtreme-s/datasets](https://huggingface.co/datasets/xtreme-s/datasets). The test sets contain audio data only. Evaluate your system on the six test sets by generating predictions for the unlabelled audio samples. For each test set, save the predictions to a `.txt` file, one prediction per line, in the order that the audio samples are provided. Name each `.txt` file after the corresponding XTREME-S test set shown in the table (e.g. the predictions for FLEURS should be named `fleurs.txt`).
    \n
    Once you have evaluated your system on all six mandatory test sets, move the prediction files into a single folder and zip it. The name of the zipped folder is the name that will be shown on the leaderboard (e.g. `mSLAM.zip` will be displayed as `mSLAM`). Upload your zipped submission for scoring and placement on the leaderboard.
    \n
    Should you experience any issues, open a discussion via the [new discussion](https://huggingface.co/spaces/xtreme-s/leaderboard/discussions/new) link and tag `@sanchit-gandhi`.
    """
)
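
# Illustrative layout of a submission zip as described above ("mSLAM" is just a placeholder
# system name; the .txt files correspond to the six mandatory test sets):
#
#   mSLAM.zip
#   └── mSLAM/
#       ├── fleurs.txt      # one prediction per line, in the order the audio samples are provided
#       ├── mls.txt
#       ├── vp.txt
#       ├── covost-2.txt
#       ├── f-lid.txt
#       └── m-14.txt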

# Using the "with" syntax
with st.form(key="my_form"):
    uploaded_file = st.file_uploader("Choose a zip file")
    submit_button = st.form_submit_button(label="Submit")

if submit_button:
    if uploaded_file is None:
        raise ValueError("Please make sure to have uploaded a zip file.")

    submission = uploaded_file.name.split(".zip")[0]
    with st.spinner(f"Uploading {submission}..."):
        with zipfile.ZipFile(uploaded_file, "r") as zip_ref:
            zip_ref.extractall(submission_repo.local_dir)
            submission_repo.push_to_hub()

    with st.spinner(f"Computing XTREME-S Score for {submission}..."):
        results = {"name": submission}
        all_submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))

        submitted_files = [f for f in all_submitted_files if f in EXPECTED_TEST_FILES]
        submitted_optional_files = [f for f in all_submitted_files if f in OPTIONAL_TEST_FILES]

        if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
            raise ValueError(
                f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
            )

        for file in submitted_files:
            ref_file = os.path.join(REFERENCE_NAME, file)
            pred_file = os.path.join(SUBMISSION_NAME, submission, file)

            test_set = file.split(".")[0]
            metric = METRIC_MAP[test_set]

            score = compute_score(pred_file, ref_file, metric)
            results[test_set] = round(100 * score, 2)

        # TODO: assessment of 'optional' test sets

        # XTREME-S score is computed over the mandatory test sets only
        average_score = (
            0.4 * (100 - (results["fleurs"] + results["mls"] + results["vp"]) / 3)
            + 0.4 * results["covost-2"]
            + 0.2 * (results["f-lid"] + results["m-14"]) / 2
        )
        results["average-score"] = round(average_score, 2)

        all_results = concat([all_results, DataFrame([results])], ignore_index=True)

        # save and upload new evaluated results
        all_results.to_csv(CSV_RESULTS_FILE, index=False)
        commit_url = submission_repo.push_to_hub()

    st.success("Please refresh this space (CTRL+R) to see your result")