Spaces: Runtime error
Commit 0b5dbbb: Create new file
Parent(s): 2b35b5a
Committed by esc-bencher
app.py ADDED
@@ -0,0 +1,167 @@
import streamlit as st
from pandas import read_csv
import os
import jiwer
from huggingface_hub import Repository
import zipfile

REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"

REFERENCE_URL = os.path.join(
    "https://huggingface.co/datasets/esc-benchmark", REFERENCE_NAME
)
SUBMISSION_URL = os.path.join(
    "https://huggingface.co/datasets/esc-benchmark", SUBMISSION_NAME
)

TEST_SETS = [
    "librispeech-clean",
    "librispeech-other",
    "common-voice-9",
    "vox-populi",
    "ted-lium",
    "giga-speech",
    "spgi-speech",
    "earnings-22",
    "ami",
]
EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]
OPTIONAL_TEST_SETS = ["switch-board", "call-home", "chime-4"]

CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")


HF_TOKEN = os.environ.get("HF_TOKEN")


def compute_wer(pred_file, ref_file):
    with open(pred_file, "r", encoding="utf-8") as pred, open(
        ref_file, "r", encoding="utf-8"
    ) as ref:
        pred_lines = [line.strip() for line in pred.readlines()]
        ref_lines = [line.strip() for line in ref.readlines()]

    wer = jiwer.wer(ref_lines, pred_lines)
    return wer


reference_repo = Repository(
    local_dir="references", clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN
)
submission_repo = Repository(
    local_dir="submissions", clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN
)
submission_repo.git_pull()

all_submissions = [
    folder
    for folder in os.listdir(SUBMISSION_NAME)
    if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]


COLUMN_NAMES = {
    "librispeech-clean": "ls-clean",
    "librispeech-other": "ls-other",
    "common-voice-9": "cv9",
    "vox-populi": "vox",
    "ted-lium": "ted",
    "giga-speech": "giga",
    "spgi-speech": "spgi",
    "earnings-22": "e22",
    "ami": "ami",
    "chime-4": "chime",
    "switch-board": "swbd",
    "call-home": "ch",
}

all_results = read_csv(CSV_RESULTS_FILE)


# Write table from CSV
table = all_results.copy()

esc_column = table.pop("esc-score")
name_column = table.pop("name")
table.insert(0, "esc-score", esc_column)
# TODO: revert to scaling raw WER by 100 to retrieve % point values
table = table.select_dtypes(exclude=['object', 'string'])  # * 100
table.insert(0, "name", name_column)
table = table.round(2)
table = table.rename(columns=COLUMN_NAMES)
# start indexing from 1
table.index = table.index + 1

# Streamlit
st.markdown("# ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition")

st.markdown(
    f"""
    This is the leaderboard of the End-to-end Speech Challenge (ESC).
    Submitted systems are ranked by the **ESC Score**, which is the average of
    all non-optional datasets: {', '.join(COLUMN_NAMES.values())}."""
)

# st.table(table)
st.dataframe(table.style.format(subset=["esc-score", *[COLUMN_NAMES[k] for k in COLUMN_NAMES]], formatter="{:.1f}"))

st.markdown(
    """
    ESC was proposed in *ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition* by ...
    \n
    The abstract of the paper is as follows:
    \n
    *Speech recognition applications cover a range of different audio and text distributions, with different speaking styles, background noise, transcription punctuation and character casing. However, many speech recognition systems require dataset-specific tuning (audio filtering, punctuation removal and normalisation of casing), therefore assuming a-priori knowledge of both the audio and text distributions. This tuning requirement can lead to systems failing to generalise to other datasets and domains. To promote the development of multi-domain speech systems, we introduce the End-to-end Speech Challenge (ESC) for evaluating the performance of a single automatic speech recognition (ASR) system across a broad set of speech datasets. Benchmarked systems must use the same data pre- and post-processing algorithm across datasets - assuming the audio and text data distributions are a-priori unknown. We compare a series of state-of-the-art (SoTA) end-to-end (E2E) systems on this benchmark, demonstrating how a single speech system can be applied and evaluated on a wide range of data distributions. We find E2E systems to be effective across datasets: in a fair comparison, E2E systems achieve within 2.6% of SoTA systems tuned to a specific dataset. Our analysis reveals that transcription artefacts, such as punctuation and casing, pose difficulties for ASR systems and should be included in evaluation. We believe E2E benchmarking over a range of datasets promotes the research of multi-domain speech recognition systems.*
    \n
    For more information, please see the official submission on [OpenReview.net](https://openreview.net/forum?id=9OL2fIfDLK).
    """
)

st.markdown("To submit to ESC, download the audio data for the nine mandatory ESC test sets from [esc-datasets](https://huggingface.co/datasets/esc-benchmark/esc-datasets). The test sets contain audio data only. Evaluate your system on the nine test sets by generating predictions for the unlabelled audio samples. For each test set, save the predictions in the order that the audio samples are provided in a .txt file, with one prediction per line. Name the .txt file according to the ESC test set names shown in the table (e.g. the predictions for LibriSpeech test-clean should be named ls-clean.txt). Once you have evaluated your system on all nine test sets, move the predictions into one folder and zip it. The name you assign to the folder will be the name that is shown on the table (e.g. whisper-aed.zip will be displayed as whisper-aed)."
)

# Using the "with" syntax
with st.form(key="my_form"):
    uploaded_file = st.file_uploader("Choose a zip file")
    submit_button = st.form_submit_button(label="Submit")

if submit_button:
    if uploaded_file is None:
        raise ValueError("Please make sure to have uploaded a zip file.")

    submission = uploaded_file.name.split(".zip")[0]
    with st.spinner(f"Uploading {submission}..."):
        with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
            zip_ref.extractall(submission_repo.local_dir)
        submission_repo.push_to_hub()

    with st.spinner(f"Computing ESC Score for {submission}..."):
        results = {"name": submission}
        submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))

        submitted_files = [f for f in submitted_files if f in EXPECTED_TEST_FILES]

        if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
            raise ValueError(
                f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
            )

        for file in submitted_files:
            ref_file = os.path.join(REFERENCE_NAME, file)
            pred_file = os.path.join(SUBMISSION_NAME, submission, file)

            wer = compute_wer(pred_file, ref_file)
            results[file.split(".")[0]] = str(wer)

        wer_values = [float(results[t]) for t in TEST_SETS]
        all_wer = sum(wer_values) / len(wer_values)

        results["esc-score"] = all_wer
        all_results = all_results.append(results, ignore_index=True)

        # save and upload new evaluated results
        all_results.to_csv(CSV_RESULTS_FILE)
        commit_url = submission_repo.push_to_hub()

    st.success('Please refresh this space (CTRL+R) to see your result')
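Two version-compatibility notes on the script above (editorial, not part of the commit). DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on a current pandas the submission handler would fail at all_results.append(...); the git-based huggingface_hub.Repository helper is likewise deprecated in recent huggingface_hub releases in favour of the HTTP-based HfApi methods. A minimal sketch of a drop-in replacement for the append step, assuming pandas 2.x (the helper name append_result is illustrative):

# Sketch only: replace the removed DataFrame.append with pd.concat (pandas >= 2.0).
import pandas as pd

def append_result(all_results: pd.DataFrame, results: dict) -> pd.DataFrame:
    # Build a one-row frame from the submission's results dict and concatenate it.
    new_row = pd.DataFrame([results])
    return pd.concat([all_results, new_row], ignore_index=True)

# Usage in the submission handler:
# all_results = append_result(all_results, results)
# all_results.to_csv(CSV_RESULTS_FILE, index=False)  # index=False keeps results.csv free of an extra "Unnamed: 0" column on reload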
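For reference, the submission instructions rendered by the app (one .txt of predictions per test set, one hypothesis per line in the order the audio samples are served, all nine files zipped inside a single folder named after the system) can be followed with a short packaging script. The sketch below is illustrative, not part of the Space: the predictions/ directory and the my-system name are hypothetical. Note that, as written, the validation against EXPECTED_TEST_FILES expects files named after the full identifiers in TEST_SETS (e.g. librispeech-clean.txt), so the sketch uses those names rather than the shortened table names (e.g. ls-clean.txt) mentioned in the on-page text.

# Illustrative packaging script (not part of the Space): zips nine prediction files
# into <submission_name>.zip with a single top-level folder of the same name.
import os
import zipfile

TEST_SETS = [
    "librispeech-clean", "librispeech-other", "common-voice-9", "vox-populi",
    "ted-lium", "giga-speech", "spgi-speech", "earnings-22", "ami",
]

submission_name = "my-system"  # hypothetical; shown as the system name on the leaderboard
pred_dir = "predictions"       # hypothetical local folder holding one <test-set>.txt per test set

missing = [f"{t}.txt" for t in TEST_SETS if not os.path.isfile(os.path.join(pred_dir, f"{t}.txt"))]
if missing:
    raise FileNotFoundError(f"Missing prediction files: {', '.join(missing)}")

with zipfile.ZipFile(f"{submission_name}.zip", "w", zipfile.ZIP_DEFLATED) as zf:
    for t in TEST_SETS:
        # arcname places every file under the submission folder inside the archive,
        # so extraction recreates submissions/<submission_name>/<test-set>.txt.
        zf.write(os.path.join(pred_dir, f"{t}.txt"), arcname=f"{submission_name}/{t}.txt")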