zhuohan-7 committed on
Commit
4c1d731
1 Parent(s): 6d54304

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app/content.py +1 -1
  2. app/draw_diagram.py +120 -150
  3. app/pages.py +45 -107
  4. app/show_examples.py +23 -58
app/content.py CHANGED
@@ -62,7 +62,7 @@ cnasr_datasets = {
62
  }
63
 
64
  metrics = {
65
- 'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower the better)',
66
  'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
67
  'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
68
  'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
 
62
  }
63
 
64
  metrics = {
65
+ 'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
66
  'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
67
  'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
68
  'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
app/draw_diagram.py CHANGED
@@ -7,12 +7,20 @@ from streamlit.components.v1 import html
7
  from app.show_examples import *
8
  import pandas as pd
9
 
 
 
 
10
  # huggingface_image = Image.open('style/huggingface.jpg')
11
 
12
  # other info
13
- #path = "./AudioBench-Leaderboard/additional_info/Leaderboard-Rename.xlsx"
14
- path = "./additional_info/Leaderboard-Rename.xlsx"
15
- info_df = pd.read_excel(path)
 
 
 
 
 
16
 
17
  # def nav_to(value):
18
  # try:
@@ -26,11 +34,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
26
 
27
  folder = f"./results/{metrics}/"
28
 
29
- display_names = {
30
- 'SU': 'Speech Understanding',
31
- 'ASU': 'Audio Scene Understanding',
32
- 'VU': 'Voice Understanding'
33
- }
34
 
35
  data_path = f'{folder}/{category_name.lower()}.csv'
36
  chart_data = pd.read_csv(data_path).round(3)
@@ -50,8 +53,9 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
50
  """, unsafe_allow_html=True)
51
 
52
  # remap model names
53
- display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['AudioBench'], info_df['Proper Display Name'])}
54
- chart_data['model_show'] = chart_data['Model'].map(display_model_names)
 
55
 
56
  models = st.multiselect("Please choose the model",
57
  sorted(chart_data['model_show'].tolist()),
@@ -61,86 +65,17 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
61
  chart_data = chart_data[chart_data['model_show'].isin(models)]
62
  chart_data = chart_data.sort_values(by=[new_dataset_name], ascending=cus_sort).dropna(axis=0)
63
 
64
- if len(chart_data) == 0:
65
- return
66
 
67
- # Get Values
68
- data_values = chart_data.iloc[:, 1]
69
-
70
- # Calculate Q1 and Q3
71
- q1 = data_values.quantile(0.25)
72
- q3 = data_values.quantile(0.75)
73
-
74
- # Calculate IQR
75
- iqr = q3 - q1
76
-
77
- # Define lower and upper bounds (1.5*IQR is a common threshold)
78
- lower_bound = q1 - 1.5 * iqr
79
- upper_bound = q3 + 1.5 * iqr
80
-
81
- # Filter data within the bounds
82
- filtered_data = data_values[(data_values >= lower_bound) & (data_values <= upper_bound)]
83
-
84
- # Calculate min and max values after outlier handling
85
- min_value = round(filtered_data.min() - 0.1 * filtered_data.min(), 3)
86
- max_value = round(filtered_data.max() + 0.1 * filtered_data.max(), 3)
87
-
88
- options = {
89
- #"title": {"text": f"{display_names[folder_name.upper()]}"},
90
- "title": {"text": f"{dataset_name}"},
91
- "tooltip": {
92
- "trigger": "axis",
93
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
94
- "triggerOn": 'mousemove',
95
- },
96
- "legend": {"data": ['Overall Accuracy']},
97
- "toolbox": {"feature": {"saveAsImage": {}}},
98
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
99
- "xAxis": [
100
- {
101
- "type": "category",
102
- "boundaryGap": True,
103
- "triggerEvent": True,
104
- "data": chart_data['model_show'].tolist(),
105
- }
106
- ],
107
- "yAxis": [{"type": "value",
108
- "min": min_value,
109
- "max": max_value,
110
- "boundaryGap": True
111
- # "splitNumber": 10
112
- }],
113
- "series": [{
114
- "name": f"{dataset_name}",
115
- "type": "bar",
116
- "data": chart_data[f'{new_dataset_name}'].tolist(),
117
- }],
118
- }
119
-
120
- events = {
121
- "click": "function(params) { return params.value }"
122
- }
123
 
124
- value = st_echarts(options=options, events=events, height="500px")
125
-
126
- # if value != None:
127
- # # print(value)
128
- # nav_to(value)
129
-
130
- # if value != None:
131
- # highlight_table_line(value)
132
 
 
133
  '''
134
- Show table
135
  '''
136
- # st.divider()
137
  with st.container():
138
- # st.write("")
139
- st.markdown('##### Results')
140
- # custom_css = """
141
-
142
- # """
143
- # st.markdown(custom_css, unsafe_allow_html=True)
144
 
145
  model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
146
 
@@ -148,6 +83,9 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
148
 
149
  chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
150
 
 
 
 
151
  cur_dataset_name = chart_data_table.columns[1]
152
 
153
  if cur_dataset_name in [
@@ -162,7 +100,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
162
  'tedlium3_long_form_test',
163
  'imda_part1_asr_test',
164
  'imda_part2_asr_test',
165
-
166
  'aishell_asr_zh_test',
167
  ]:
168
 
@@ -187,10 +124,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
187
  chart_data_table.columns[1]: {'alignment': 'left'},
188
  "model_link": st.column_config.LinkColumn(
189
  "Model Link",
190
- # # # help="",
191
- # validate=r"^https://(.*?)$",
192
- # # max_chars=100,
193
- # display_text=r"\[(.*?)\]"
194
  ),
195
  },
196
  hide_index=True,
@@ -198,68 +131,105 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
198
  )
199
 
200
 
201
-
 
 
 
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- # s = ''
205
- # for model in models:
206
- # try:
207
- # # <td align="center"><input type="checkbox" name="select"></td>
208
- # s += f"""<tr>
209
- # <td><a href={model_link[model]}>{model}</a></td>
210
- # <td>{chart_data[chart_data['Model'] == model][new_dataset_name].tolist()[0]}</td>
211
- # </tr>"""
212
- # except:
213
- # # print(f"{model} is not in {dataset_name}")
214
- # continue
215
-
216
- # # select all function
217
- # select_all_function = """<script>
218
- # function toggle(source) {
219
- # var checkboxes = document.querySelectorAll('input[type="checkbox"]');
220
- # for (var i = 0; i < checkboxes.length; i++) {
221
- # if (checkboxes[i] != source)
222
- # checkboxes[i].checked = source.checked;
223
- # }
224
- # }
225
- # </script>"""
226
- # st.markdown(f"""
227
- # <div class="select_all">{select_all_function}</div>
228
- # """, unsafe_allow_html=True)
229
-
230
- # info_body_details = f"""
231
- # <table style="width:80%">
232
- # <thead>
233
- # <tr style="text-align: center;">
234
- # <th style="width:45%">MODEL</th>
235
- # <th style="width:45%">{dataset_name}</th>
236
- # </tr>
237
- # {s}
238
- # </thead>
239
- # </table>
240
- # """
241
- # #<th style="width:10%"><input type="checkbox" onclick="toggle(this);"></th>
242
- # # html_code = custom_css + select_all_function + info_body_details
243
- # # html(html_code, height = 300)
244
-
245
- # st.markdown(f"""
246
- # <div class="my-data-table">{info_body_details}</div>
247
- # """, unsafe_allow_html=True)
248
-
249
-
250
- # st.dataframe(chart_data,
251
- # # column_config = {
252
- # # "Link": st.column_config.LinkColumn(
253
- # # display_text= st.image(huggingface_image)
254
- # # ),
255
- # # },
256
- # hide_index = True,
257
- # use_container_width=True)
258
  '''
259
- show samples
260
  '''
261
- if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
262
- pass
263
- else:
264
- show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)
265
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from app.show_examples import *
8
  import pandas as pd
9
 
10
+ from model_information import get_dataframe
11
+
12
+
13
  # huggingface_image = Image.open('style/huggingface.jpg')
14
 
15
  # other info
16
+ # path = "./AudioBench-Leaderboard/additional_info/Leaderboard-Rename.xlsx"
17
+ # path = "./additional_info/Leaderboard-Rename.xlsx"
18
+
19
+ # info_df = pd.read_excel(path)
20
+
21
+ info_df = get_dataframe()
22
+
23
+
24
 
25
  # def nav_to(value):
26
  # try:
 
34
 
35
  folder = f"./results/{metrics}/"
36
 
 
 
 
 
 
37
 
38
  data_path = f'{folder}/{category_name.lower()}.csv'
39
  chart_data = pd.read_csv(data_path).round(3)
 
53
  """, unsafe_allow_html=True)
54
 
55
  # remap model names
56
+ display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
57
+ chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
58
+
59
 
60
  models = st.multiselect("Please choose the model",
61
  sorted(chart_data['model_show'].tolist()),
 
65
  chart_data = chart_data[chart_data['model_show'].isin(models)]
66
  chart_data = chart_data.sort_values(by=[new_dataset_name], ascending=cus_sort).dropna(axis=0)
67
 
68
+ if len(chart_data) == 0: return
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
 
 
 
 
 
 
 
 
71
 
72
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
73
  '''
74
+ Show Table
75
  '''
 
76
  with st.container():
77
+ st.markdown('##### TABLE')
78
+
 
 
 
 
79
 
80
  model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
81
 
 
83
 
84
  chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
85
 
86
+ # Format numeric columns to 3 decimal places
87
+ chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: f"{x:.3f}" if isinstance(x, (int, float)) else x)
88
+
89
  cur_dataset_name = chart_data_table.columns[1]
90
 
91
  if cur_dataset_name in [
 
100
  'tedlium3_long_form_test',
101
  'imda_part1_asr_test',
102
  'imda_part2_asr_test',
 
103
  'aishell_asr_zh_test',
104
  ]:
105
 
 
124
  chart_data_table.columns[1]: {'alignment': 'left'},
125
  "model_link": st.column_config.LinkColumn(
126
  "Model Link",
 
 
 
 
127
  ),
128
  },
129
  hide_index=True,
 
131
  )
132
 
133
 
134
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
135
+ '''
136
+ Show Chart
137
+ '''
138
 
139
+ # Initialize a session state variable for toggling the chart visibility
140
+ if "show_chart" not in st.session_state:
141
+ st.session_state.show_chart = False
142
+
143
+ # Create a button to toggle visibility
144
+ if st.button("Show Chart"):
145
+ st.session_state.show_chart = not st.session_state.show_chart
146
+
147
+ if st.session_state.show_chart:
148
+
149
+ with st.container():
150
+ st.markdown('##### CHART')
151
+
152
+ # Get Values
153
+ data_values = chart_data.iloc[:, 1]
154
+
155
+ # Calculate Q1 and Q3
156
+ q1 = data_values.quantile(0.25)
157
+ q3 = data_values.quantile(0.75)
158
+
159
+ # Calculate IQR
160
+ iqr = q3 - q1
161
+
162
+ # Define lower and upper bounds (1.5*IQR is a common threshold)
163
+ lower_bound = q1 - 1.5 * iqr
164
+ upper_bound = q3 + 1.5 * iqr
165
+
166
+ # Filter data within the bounds
167
+ filtered_data = data_values[(data_values >= lower_bound) & (data_values <= upper_bound)]
168
+
169
+ # Calculate min and max values after outlier handling
170
+ min_value = round(filtered_data.min() - 0.1 * filtered_data.min(), 3)
171
+ max_value = round(filtered_data.max() + 0.1 * filtered_data.max(), 3)
172
+
173
+ options = {
174
+ # "title": {"text": f"{dataset_name}"},
175
+ "tooltip": {
176
+ "trigger": "axis",
177
+ "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
178
+ "triggerOn": 'mousemove',
179
+ },
180
+ "legend": {"data": ['Overall Accuracy']},
181
+ "toolbox": {"feature": {"saveAsImage": {}}},
182
+ "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
183
+ "xAxis": [
184
+ {
185
+ "type": "category",
186
+ "boundaryGap": True,
187
+ "triggerEvent": True,
188
+ "data": chart_data['model_show'].tolist(),
189
+ }
190
+ ],
191
+ "yAxis": [{"type": "value",
192
+ "min": min_value,
193
+ "max": max_value,
194
+ "boundaryGap": True
195
+ # "splitNumber": 10
196
+ }],
197
+ "series": [{
198
+ "name": f"{dataset_name}",
199
+ "type": "bar",
200
+ "data": chart_data[f'{new_dataset_name}'].tolist(),
201
+ }],
202
+ }
203
+
204
+ events = {
205
+ "click": "function(params) { return params.value }"
206
+ }
207
+
208
+ value = st_echarts(options=options, events=events, height="500px")
209
+
210
+
211
+
212
+
213
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  '''
216
+ Show Examples
217
  '''
218
+
219
+
220
+ # Initialize a session state variable for toggling the examples visibility
221
+ if "show_examples" not in st.session_state:
222
+ st.session_state.show_examples = False
223
+
224
+ # Create a button to toggle visibility
225
+ if st.button("Show Examples"):
226
+ st.session_state.show_examples = not st.session_state.show_examples
227
+
228
+ if st.session_state.show_examples:
229
+
230
+ # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
231
+ if dataset_name in []:
232
+ pass
233
+ else:
234
+ show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)
235
+
app/pages.py CHANGED
@@ -9,8 +9,8 @@ def dataset_contents(dataset, metrics):
9
  .my-dataset-info {
10
  # background-color: #F9EBEA;
11
  # padding: 10px;
12
- color: #626567;
13
- font-style: italic;
14
  font-size: 8px;
15
  height: auto;
16
  }
@@ -18,10 +18,10 @@ def dataset_contents(dataset, metrics):
18
  """
19
  st.markdown(custom_css, unsafe_allow_html=True)
20
  st.markdown(f"""<div class="my-dataset-info">
21
- <p><b>Dataset Information</b>: {dataset}</p>
22
  </div>""", unsafe_allow_html=True)
23
  st.markdown(f"""<div class="my-dataset-info">
24
- <p><b>Metric Information</b>: {metrics}</p>
25
  </div>""", unsafe_allow_html=True)
26
 
27
 
@@ -38,12 +38,16 @@ def dashboard():
38
 
39
  audio_url = "https://arxiv.org/abs/2406.16020"
40
 
 
 
 
 
41
  st.divider()
42
- st.markdown("#### [AudioBench](%s)" % audio_url)
43
- st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audiolanguage models")
 
 
44
  st.markdown('''
45
-
46
-
47
  ''')
48
 
49
  with st.container():
@@ -51,7 +55,8 @@ def dashboard():
51
  with center_co:
52
  st.image("./style/audio_overview.png",
53
  caption="Overview of the datasets in AudioBench.",
54
- use_column_width = True)
 
55
 
56
  st.markdown('''
57
 
@@ -60,21 +65,9 @@ def dashboard():
60
 
61
  st.markdown("###### :dart: Our Benchmark includes: ")
62
  cols = st.columns(10)
63
- cols[1].metric(label="Tasks", value="8") #delta="Tasks", delta_color="off"
64
- cols[2].metric(label="Datasets", value="26")
65
- cols[3].metric(label="Test Models", value="5")
66
-
67
- # st.markdown("###### :dart: Supported Models and Datasets: ")
68
-
69
- # sup = pd.DataFrame(
70
- # {"Dataset": "LibriSpeech-Clean",
71
- # "Category": st.selectbox('category', ['Speech Understanding']),
72
- # "Task": st.selectbox('task', ['Automatic Speech Recognition']),
73
- # "Metrics": st.selectbox('metrics', ['WER']),
74
- # "Status":True}
75
- # )
76
-
77
- # st.data_editor(sup, num_rows="dynamic")
78
 
79
 
80
  st.divider()
@@ -92,7 +85,7 @@ def dashboard():
92
  ''')
93
 
94
  def asr():
95
- st.title("Automatic Speech Recognition")
96
 
97
  filters_levelone = ['LibriSpeech-Test-Clean',
98
  'LibriSpeech-Test-Other',
@@ -103,41 +96,23 @@ def asr():
103
  'Earnings22-Test',
104
  'Tedlium3-Test',
105
  'Tedlium3-Long-form-Test',
106
- 'IMDA-Part1-ASR-Test',
107
- 'IMDA-Part2-ASR-Test']
 
108
 
109
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
110
 
111
  with left:
112
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
113
 
114
- # with middle:
115
- # if filter_1 == filters_levelone[0]:
116
- # sort_leveltwo = ['LibriSpeech-Test-Clean', 'LibriSpeech-Test-Other', 'Common-Voice-15-En-Test', 'Peoples-Speech-Test',
117
- # 'GigaSpeech-Test', 'Tedlium3-Test','Tedlium3-Long-form-Test', 'Earning-21-Test', 'Earning-22-Test']
118
- # elif filter_1 == filters_levelone[1]:
119
- # sort_leveltwo = ['CN-College-Listen-Test', 'SLUE-P2-SQA5-Test', 'DREAM-TTS-Test', 'Public-SG-SpeechQA-Test']
120
-
121
- # elif filter_1 == filters_levelone[2]:
122
- # sort_leveltwo = ['OpenHermes-Audio-Test', 'ALPACA-Audio-Test']
123
-
124
- # sort = st.selectbox("Sort Dataset", sort_leveltwo)
125
-
126
- # with right:
127
- # sorted = st.selectbox('by', ['Ascending', 'Descending'])
128
-
129
  if filter_1:
130
  dataset_contents(asr_datsets[filter_1], metrics['wer'])
131
  draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
132
- # else:
133
- # draw('su', 'ASR', 'LibriSpeech-Test-Clean', 'wer')
134
 
135
-
136
- ## examples
137
 
138
 
139
  def sqa():
140
- st.title("Speech Question Answering")
141
 
142
  binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
143
 
@@ -150,7 +125,7 @@ def sqa():
150
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
151
 
152
  with left:
153
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
154
 
155
  if filter_1:
156
  if filter_1 in binary:
@@ -160,11 +135,9 @@ def sqa():
160
  else:
161
  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
162
  draw('su', 'SQA', filter_1, 'llama3_70b_judge')
163
- # else:
164
- # draw('su', 'SQA', 'CN-College-Listen-Test', 'llama3_70b_judge_binary')
165
 
166
  def si():
167
- st.title("Speech Question Answering")
168
 
169
  filters_levelone = ['OpenHermes-Audio-Test',
170
  'ALPACA-Audio-Test']
@@ -172,16 +145,14 @@ def si():
172
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
173
 
174
  with left:
175
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
176
 
177
  if filter_1:
178
  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
179
  draw('su', 'SI', filter_1, 'llama3_70b_judge')
180
- # else:
181
- # draw('su', 'SI', 'OpenHermes-Audio-Test', 'llama3_70b_judge')
182
 
183
  def ac():
184
- st.title("Audio Captioning")
185
 
186
  filters_levelone = ['WavCaps-Test',
187
  'AudioCaps-Test']
@@ -190,29 +161,17 @@ def ac():
190
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
191
 
192
  with left:
193
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
194
  with middle:
195
- metric = st.selectbox('Select Metric', filters_leveltwo)
196
-
197
- # with middle:
198
- # if filter_1 == filters_levelone[0]:
199
- # sort_leveltwo = ['Clotho-AQA-Test', 'WavCaps-QA-Test', 'AudioCaps-QA-Test']
200
- # elif filter_1 == filters_levelone[1]:
201
- # sort_leveltwo = ['WavCaps-Test', 'AudioCaps-Test']
202
-
203
- # sort = st.selectbox("Sort Dataset", sort_leveltwo)
204
-
205
- # with right:
206
- # sorted = st.selectbox('by', ['Ascending', 'Descending'])
207
 
208
  if filter_1 or metric:
209
  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
210
  draw('asu', 'AC',filter_1, metric.lower().replace('-', '_'))
211
- # else:
212
- # draw('asu', 'AC', 'WavCaps-Test', 'llama3_70b_judge')
213
 
214
  def asqa():
215
- st.title("Audio Scene Question Answering")
216
 
217
  filters_levelone = ['Clotho-AQA-Test',
218
  'WavCaps-QA-Test',
@@ -221,57 +180,39 @@ def asqa():
221
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
222
 
223
  with left:
224
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
225
 
226
  if filter_1:
227
  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
228
  draw('asu', 'AQA',filter_1, 'llama3_70b_judge')
229
- # else:
230
- # draw('asu', 'AQA', 'Clotho-AQA-Test', 'llama3_70b_judge')
231
 
232
  def er():
233
- st.title("Emotion Recognition")
234
 
235
  filters_levelone = ['IEMOCAP-Emotion-Test',
236
  'MELD-Sentiment-Test',
237
  'MELD-Emotion-Test']
238
- # sort_leveltwo = []
239
 
240
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
241
 
242
  with left:
243
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
244
-
245
- # with middle:
246
- # if filter_1 == filters_levelone[0]:
247
- # sort_leveltwo = ['IEMOCAP-Emotion-Test', 'MELD-Sentiment-Test', 'MELD-Emotion-Test']
248
-
249
- # elif filter_1 == filters_levelone[1]:
250
- # sort_leveltwo = ['VoxCeleb1-Accent-Test']
251
-
252
- # elif filter_1 == filters_levelone[2]:
253
- # sort_leveltwo = ['VoxCeleb1-Gender-Test', 'IEMOCAP-Gender-Test']
254
-
255
- # sort = st.selectbox("Sort Dataset", sort_leveltwo)
256
-
257
- # with right:
258
- # sorted = st.selectbox('by', ['Ascending', 'Descending'])
259
 
260
  if filter_1:
261
  dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
262
  draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
263
- # else:
264
- # draw('vu', 'ER', 'IEMOCAP-Emotion-Test', 'llama3_70b_judge_binary')
265
 
266
  def ar():
267
- st.title("Accent Recognition")
268
 
269
  filters_levelone = ['VoxCeleb-Accent-Test']
270
 
271
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
272
 
273
  with left:
274
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
275
 
276
 
277
  if filter_1:
@@ -280,7 +221,7 @@ def ar():
280
 
281
 
282
  def gr():
283
- st.title("Gender Recognition")
284
 
285
  filters_levelone = ['VoxCeleb-Gender-Test',
286
  'IEMOCAP-Gender-Test']
@@ -288,16 +229,15 @@ def gr():
288
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
289
 
290
  with left:
291
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
292
 
293
  if filter_1:
294
  dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
295
  draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
296
- # else:
297
- # draw('vu', 'GR', 'VoxCeleb1-Gender-Test', 'llama3_70b_judge_binary')
298
 
299
  def spt():
300
- st.title("Speech Translation")
301
 
302
  filters_levelone = ['Covost2-EN-ID-test',
303
  'Covost2-EN-ZH-test',
@@ -309,7 +249,7 @@ def spt():
309
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
310
 
311
  with left:
312
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
313
 
314
  if filter_1:
315
  dataset_contents(spt_datasets[filter_1], metrics['bleu'])
@@ -318,17 +258,15 @@ def spt():
318
  # draw('su', 'ST', 'Covost2-EN-ID-test', 'bleu')
319
 
320
  def cnasr():
321
- st.title("Chinese Automatic Speech Recognition")
322
 
323
  filters_levelone = ['Aishell-ASR-ZH-Test']
324
 
325
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
326
 
327
  with left:
328
- filter_1 = st.selectbox('Select Dataset', filters_levelone)
329
 
330
  if filter_1:
331
  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
332
  draw('su', 'CNASR', filter_1, 'wer')
333
- # else:
334
- # draw('su', 'CNASR', 'Aishell-ASR-ZH-Test', 'wer')
 
9
  .my-dataset-info {
10
  # background-color: #F9EBEA;
11
  # padding: 10px;
12
+ color: #050505;
13
+ font-style: normal;
14
  font-size: 8px;
15
  height: auto;
16
  }
 
18
  """
19
  st.markdown(custom_css, unsafe_allow_html=True)
20
  st.markdown(f"""<div class="my-dataset-info">
21
+ <p><b>About this dataset</b>: {dataset}</p>
22
  </div>""", unsafe_allow_html=True)
23
  st.markdown(f"""<div class="my-dataset-info">
24
+ <p><b>About this metric</b>: {metrics}</p>
25
  </div>""", unsafe_allow_html=True)
26
 
27
 
 
38
 
39
  audio_url = "https://arxiv.org/abs/2406.16020"
40
 
41
+
42
+ st.markdown("#### News")
43
+ st.markdown("Dec, 2024: Update layout and support comparison between models with similar model sizes.")
44
+
45
  st.divider()
46
+
47
+ st.markdown("#### What is [AudioBench](%s)?" % audio_url)
48
+ st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audiolanguage models.")
49
+ st.markdown("##### :dizzy: A evaluation benchmark that we consistently put effort in updating and maintaining.")
50
  st.markdown('''
 
 
51
  ''')
52
 
53
  with st.container():
 
55
  with center_co:
56
  st.image("./style/audio_overview.png",
57
  caption="Overview of the datasets in AudioBench.",
58
+ use_container_width = True
59
+ )
60
 
61
  st.markdown('''
62
 
 
65
 
66
  st.markdown("###### :dart: Our Benchmark includes: ")
67
  cols = st.columns(10)
68
+ cols[1].metric(label="Tasks", value=">8") #delta="Tasks", delta_color="off"
69
+ cols[2].metric(label="Datasets", value=">30")
70
+ cols[3].metric(label="Evaluated Models", value=">5")
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  st.divider()
 
85
  ''')
86
 
87
  def asr():
88
+ st.title("Task: Automatic Speech Recognition")
89
 
90
  filters_levelone = ['LibriSpeech-Test-Clean',
91
  'LibriSpeech-Test-Other',
 
96
  'Earnings22-Test',
97
  'Tedlium3-Test',
98
  'Tedlium3-Long-form-Test',
99
+ #'IMDA-Part1-ASR-Test',
100
+ #'IMDA-Part2-ASR-Test'
101
+ ]
102
 
103
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
104
 
105
  with left:
106
+ filter_1 = st.selectbox('Dataset', filters_levelone)
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  if filter_1:
109
  dataset_contents(asr_datsets[filter_1], metrics['wer'])
110
  draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 
 
111
 
 
 
112
 
113
 
114
  def sqa():
115
+ st.title("Task: Speech Question Answering")
116
 
117
  binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
118
 
 
125
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
126
 
127
  with left:
128
+ filter_1 = st.selectbox('Dataset', filters_levelone)
129
 
130
  if filter_1:
131
  if filter_1 in binary:
 
135
  else:
136
  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
137
  draw('su', 'SQA', filter_1, 'llama3_70b_judge')
 
 
138
 
139
  def si():
140
+ st.title("Task: Speech Instruction")
141
 
142
  filters_levelone = ['OpenHermes-Audio-Test',
143
  'ALPACA-Audio-Test']
 
145
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
146
 
147
  with left:
148
+ filter_1 = st.selectbox('Dataset', filters_levelone)
149
 
150
  if filter_1:
151
  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
152
  draw('su', 'SI', filter_1, 'llama3_70b_judge')
 
 
153
 
154
  def ac():
155
+ st.title("Task: Audio Captioning")
156
 
157
  filters_levelone = ['WavCaps-Test',
158
  'AudioCaps-Test']
 
161
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
162
 
163
  with left:
164
+ filter_1 = st.selectbox('Dataset', filters_levelone)
165
  with middle:
166
+ metric = st.selectbox('Metric', filters_leveltwo)
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  if filter_1 or metric:
169
  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
170
  draw('asu', 'AC',filter_1, metric.lower().replace('-', '_'))
171
+
 
172
 
173
  def asqa():
174
+ st.title("Task: Audio Scene Question Answering")
175
 
176
  filters_levelone = ['Clotho-AQA-Test',
177
  'WavCaps-QA-Test',
 
180
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
181
 
182
  with left:
183
+ filter_1 = st.selectbox('Dataset', filters_levelone)
184
 
185
  if filter_1:
186
  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
187
  draw('asu', 'AQA',filter_1, 'llama3_70b_judge')
188
+
 
189
 
190
  def er():
191
+ st.title("Task: Emotion Recognition")
192
 
193
  filters_levelone = ['IEMOCAP-Emotion-Test',
194
  'MELD-Sentiment-Test',
195
  'MELD-Emotion-Test']
 
196
 
197
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
198
 
199
  with left:
200
+ filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  if filter_1:
203
  dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
204
  draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
205
+
 
206
 
207
  def ar():
208
+ st.title("Task: Accent Recognition")
209
 
210
  filters_levelone = ['VoxCeleb-Accent-Test']
211
 
212
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
213
 
214
  with left:
215
+ filter_1 = st.selectbox('Dataset', filters_levelone)
216
 
217
 
218
  if filter_1:
 
221
 
222
 
223
  def gr():
224
+ st.title("Task: Gender Recognition")
225
 
226
  filters_levelone = ['VoxCeleb-Gender-Test',
227
  'IEMOCAP-Gender-Test']
 
229
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
230
 
231
  with left:
232
+ filter_1 = st.selectbox('Dataset', filters_levelone)
233
 
234
  if filter_1:
235
  dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
236
  draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
237
+
 
238
 
239
  def spt():
240
+ st.title("Task: Speech Translation")
241
 
242
  filters_levelone = ['Covost2-EN-ID-test',
243
  'Covost2-EN-ZH-test',
 
249
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
250
 
251
  with left:
252
+ filter_1 = st.selectbox('Dataset', filters_levelone)
253
 
254
  if filter_1:
255
  dataset_contents(spt_datasets[filter_1], metrics['bleu'])
 
258
  # draw('su', 'ST', 'Covost2-EN-ID-test', 'bleu')
259
 
260
  def cnasr():
261
+ st.title("Task: Automatic Speech Recognition (Chinese)")
262
 
263
  filters_levelone = ['Aishell-ASR-ZH-Test']
264
 
265
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
266
 
267
  with left:
268
+ filter_1 = st.selectbox('Dataset', filters_levelone)
269
 
270
  if filter_1:
271
  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
272
  draw('su', 'CNASR', filter_1, 'wer')
 
 
app/show_examples.py CHANGED
@@ -2,6 +2,9 @@ import streamlit as st
2
  import datasets
3
  import numpy as np
4
 
 
 
 
5
  def show_examples(category_name, dataset_name, model_lists, display_model_names):
6
  st.divider()
7
  sample_folder = f"./examples/{category_name}/{dataset_name}"
@@ -16,57 +19,6 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
16
  # with col1:
17
  st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
18
 
19
- # with col2:
20
- # with st.container():
21
- # custom_css = """
22
- # <style>
23
- # .my-container-question {
24
- # background-color: #F5EEF8;
25
- # padding: 10px;
26
- # border-radius: 10px;
27
- # height: auto;
28
- # }
29
- # </style>
30
- # """
31
- # st.markdown(custom_css, unsafe_allow_html=True)
32
-
33
- # if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
34
-
35
- # choices = dataset[index]['other_attributes']['choices']
36
- # if isinstance(choices, str):
37
- # choices_text = choices
38
- # elif isinstance(choices, list):
39
- # choices_text = ' '.join(i for i in choices)
40
-
41
- # question_text = f"""<div class="my-container-question">
42
- # <p>QUESTION: {dataset[index]['instruction']['text']}</p>
43
- # <p>CHOICES: {choices_text}</p>
44
- # </div>
45
- # """
46
- # else:
47
- # question_text = f"""<div class="my-container-question">
48
- # <p>QUESTION: {dataset[index]['instruction']['text']}</p>
49
- # </div>"""
50
-
51
-
52
- # st.markdown(question_text, unsafe_allow_html=True)
53
-
54
- # with st.container():
55
- # custom_css = """
56
- # <style>
57
- # .my-container-answer {
58
- # background-color: #F9EBEA;
59
- # padding: 10px;
60
- # border-radius: 10px;
61
- # height: auto;
62
- # }
63
- # </style>
64
- # """
65
- # st.markdown(custom_css, unsafe_allow_html=True)
66
- # st.markdown(f"""<div class="my-container-answer">
67
- # <p>CORRECT ANSWER: {dataset[index]['answer']['text']}</p>
68
- # </div>""", unsafe_allow_html=True)
69
-
70
  if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
71
 
72
  choices = dataset[index]['other_attributes']['choices']
@@ -78,6 +30,8 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
78
  question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
79
  else:
80
  question_text = f"""{dataset[index]['instruction']['text']}"""
 
 
81
 
82
  # st.divider()
83
  with st.container():
@@ -99,33 +53,44 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
99
 
100
  s = f"""<tr>
101
  <td><b>REFERENCE</td>
102
- <td><b>{question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')}
103
  </td>
104
- <td><b>{dataset[index]['answer']['text']}
105
  </td>
106
  </tr>
107
  """
108
  if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
109
  for model in model_lists:
110
  try:
 
 
 
 
111
  s += f"""<tr>
112
  <td>{display_model_names[model]}</td>
113
  <td>
114
  {dataset[index][model]['text'].replace('Choices:', '<br>Choices:').replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')
115
  }
116
  </td>
117
- <td>{dataset[index][model]['model_prediction']}</td>
118
  </tr>"""
119
  except:
120
  print(f"{model} is not in {dataset_name}")
121
  continue
122
  else:
123
  for model in model_lists:
 
 
 
124
  try:
 
 
 
 
125
  s += f"""<tr>
126
  <td>{display_model_names[model]}</td>
127
- <td>{dataset[index][model]['text']}</td>
128
- <td>{dataset[index][model]['model_prediction']}</td>
129
  </tr>"""
130
  except:
131
  print(f"{model} is not in {dataset_name}")
@@ -136,8 +101,8 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
136
  <thead>
137
  <tr style="text-align: center;">
138
  <th style="width:20%">MODEL</th>
139
- <th style="width:40%">QUESTION</th>
140
- <th style="width:40%">MODEL PREDICTION</th>
141
  </tr>
142
  {s}
143
  </thead>
 
2
  import datasets
3
  import numpy as np
4
 
5
+ import html
6
+
7
+
8
  def show_examples(category_name, dataset_name, model_lists, display_model_names):
9
  st.divider()
10
  sample_folder = f"./examples/{category_name}/{dataset_name}"
 
19
  # with col1:
20
  st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
23
 
24
  choices = dataset[index]['other_attributes']['choices']
 
30
  question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
31
  else:
32
  question_text = f"""{dataset[index]['instruction']['text']}"""
33
+
34
+ question_text = html.escape(question_text)
35
 
36
  # st.divider()
37
  with st.container():
 
53
 
54
  s = f"""<tr>
55
  <td><b>REFERENCE</td>
56
+ <td><b>{html.escape(question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)'))}
57
  </td>
58
+ <td><b>{html.escape(dataset[index]['answer']['text'])}
59
  </td>
60
  </tr>
61
  """
62
  if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
63
  for model in model_lists:
64
  try:
65
+
66
+ model_prediction = dataset[index][model]['model_prediction']
67
+ model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
68
+
69
  s += f"""<tr>
70
  <td>{display_model_names[model]}</td>
71
  <td>
72
  {dataset[index][model]['text'].replace('Choices:', '<br>Choices:').replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')
73
  }
74
  </td>
75
+ <td>{html.escape(model_prediction)}</td>
76
  </tr>"""
77
  except:
78
  print(f"{model} is not in {dataset_name}")
79
  continue
80
  else:
81
  for model in model_lists:
82
+
83
+ print(dataset[index][model]['model_prediction'])
84
+
85
  try:
86
+
87
+ model_prediction = dataset[index][model]['model_prediction']
88
+ model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
89
+
90
  s += f"""<tr>
91
  <td>{display_model_names[model]}</td>
92
+ <td>{html.escape(dataset[index][model]['text'])}</td>
93
+ <td>{html.escape(model_prediction)}</td>
94
  </tr>"""
95
  except:
96
  print(f"{model} is not in {dataset_name}")
 
101
  <thead>
102
  <tr style="text-align: center;">
103
  <th style="width:20%">MODEL</th>
104
+ <th style="width:30%">QUESTION</th>
105
+ <th style="width:50%">MODEL PREDICTION</th>
106
  </tr>
107
  {s}
108
  </thead>