zhuohan-7 committed
Commit: 101c142
Parent: c751340

Upload folder using huggingface_hub

Files changed (3):
  1. app/draw_diagram.py +33 -9
  2. app/pages.py +83 -35
  3. app/summarization.py +116 -0
app/draw_diagram.py CHANGED
@@ -20,8 +20,6 @@ from model_information import get_dataframe
 
 info_df = get_dataframe()
 
-
-
 # def nav_to(value):
 #     try:
 #         url = links_dic[str(value).lower()]
@@ -90,6 +88,16 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
 
     cur_dataset_name = chart_data_table.columns[1]
 
+
+    def highlight_first_element(x):
+        # Create a DataFrame with the same shape as the input
+        df_style = pd.DataFrame('', index=x.index, columns=x.columns)
+
+        # Apply background color to the top row's score cell (row 0, column 1)
+        df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
+
+        return df_style
+
     if cur_dataset_name in [
         'librispeech_test_clean',
         'librispeech_test_other',
@@ -105,19 +113,35 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
         'aishell_asr_zh_test',
     ]:
 
-        styled_df = chart_data_table.style.highlight_min(
-            subset=[chart_data_table.columns[1]], color='yellow'
-        )
+        chart_data_table = chart_data_table.sort_values(
+            by=chart_data_table.columns[1],
+            ascending=True
+        ).reset_index(drop=True)
     else:
-
         chart_data_table = chart_data_table.sort_values(
             by=chart_data_table.columns[1],
             ascending=False
         ).reset_index(drop=True)
+
+    # styled_df = chart_data_table.style.highlight_min(
+    #     subset=[chart_data_table.columns[1]], color='yellow'
+    # )
 
-        styled_df = chart_data_table.style.highlight_max(
-            subset=[chart_data_table.columns[1]], color='yellow'
-        )
+    styled_df = chart_data_table.style.apply(
+        highlight_first_element, axis=None
+    )
+
+    # else:
+
+
+
+    #     # styled_df = chart_data_table.style.highlight_max(
+    #     #     subset=[chart_data_table.columns[1]], color='yellow'
+    #     # )
+
+    #     styled_df = chart_data_table.style.apply(
+    #         highlight_first_element, axis=None
+    #     )
 
 
     st.dataframe(
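Note on the Styler pattern this hunk introduces: style.apply(func, axis=None) passes the whole DataFrame to func and expects back a same-shaped DataFrame of CSS strings. A minimal, self-contained sketch of that mechanism (column names and scores below are illustrative, not from the app):

import pandas as pd

# Toy leaderboard: column 0 is the model name, column 1 the score column.
df = pd.DataFrame({
    'model_show': ['model-a', 'model-b', 'model-c'],
    'score': [0.12, 0.08, 0.31],
})

# Sort ascending so the best (lowest) WER-style score lands in row 0,
# mirroring the sort this commit adds before styling.
df = df.sort_values(by='score', ascending=True).reset_index(drop=True)

def highlight_first_element(x):
    # Same-shaped frame of empty CSS strings...
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    # ...with only the top row's score cell colored.
    styles.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
    return styles

styled_df = df.style.apply(highlight_first_element, axis=None)
print(styled_df.to_html())  # in the app, styled_df is passed to st.dataframe(...)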
app/pages.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 from app.draw_diagram import *
 from app.content import *
+from app.summarization import *
 
 def dataset_contents(dataset, metrics):
 
@@ -87,18 +88,22 @@ def dashboard():
 def asr():
     st.title("Task: Automatic Speech Recognition")
 
-    filters_levelone = ['LibriSpeech-Test-Clean',
-                        'LibriSpeech-Test-Other',
-                        'Common-Voice-15-En-Test',
-                        'Peoples-Speech-Test',
-                        'GigaSpeech-Test',
-                        'Earnings21-Test',
-                        'Earnings22-Test',
-                        'Tedlium3-Test',
-                        'Tedlium3-Long-form-Test',
-                        #'IMDA-Part1-ASR-Test',
-                        #'IMDA-Part2-ASR-Test'
-                        ]
+    sum = ['Summarization']
+
+    dataset_lists = ['LibriSpeech-Test-Clean',
+                     'LibriSpeech-Test-Other',
+                     'Common-Voice-15-En-Test',
+                     'Peoples-Speech-Test',
+                     'GigaSpeech-Test',
+                     'Earnings21-Test',
+                     'Earnings22-Test',
+                     'Tedlium3-Test',
+                     'Tedlium3-Long-form-Test',
+                     #'IMDA-Part1-ASR-Test',
+                     #'IMDA-Part2-ASR-Test'
+                     ]
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -106,21 +111,26 @@ def asr():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(asr_datsets[filter_1], metrics['wer'])
-        draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
+        if filter_1 in sum:
+            sum_table_mulit_metrix('ASR', ['wer'])
+        else:
+            dataset_contents(asr_datsets[filter_1], metrics['wer'])
+            draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 
 
 
 def sqa():
     st.title("Task: Speech Question Answering")
 
+    sum = ['Summarization']
+
     binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
 
     rest = ['SLUE-P2-SQA5-Test',
             'Public-SG-Speech-QA-Test',
             'Spoken-Squad-Test']
 
-    filters_levelone = binary + rest
+    filters_levelone = sum + binary + rest
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -128,7 +138,10 @@ def sqa():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        if filter_1 in binary:
+        if filter_1 in sum:
+            sum_table_mulit_metrix('SQA', ['llama3_70b_judge_binary', 'llama3_70b_judge'])
+
+        elif filter_1 in binary:
             dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge_binary'])
             draw('su', 'SQA', filter_1, 'llama3_70b_judge_binary')
 
@@ -139,8 +152,12 @@
 def si():
     st.title("Task: Speech Instruction")
 
-    filters_levelone = ['OpenHermes-Audio-Test',
-                        'ALPACA-Audio-Test']
+    sum = ['Summarization']
+
+    dataset_lists = ['OpenHermes-Audio-Test',
+                     'ALPACA-Audio-Test']
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -148,8 +165,11 @@ def si():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
-        draw('su', 'SI', filter_1, 'llama3_70b_judge')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('SI', ['llama3_70b_judge'])
+        else:
+            dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
+            draw('su', 'SI', filter_1, 'llama3_70b_judge')
 
 def ac():
     st.title("Task: Audio Captioning")
@@ -173,9 +193,13 @@ def ac():
 def asqa():
     st.title("Task: Audio Scene Question Answering")
 
-    filters_levelone = ['Clotho-AQA-Test',
-                        'WavCaps-QA-Test',
-                        'AudioCaps-QA-Test']
+    sum = ['Summarization']
+
+    dataset_lists = ['Clotho-AQA-Test',
+                     'WavCaps-QA-Test',
+                     'AudioCaps-QA-Test']
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -183,16 +207,23 @@ def asqa():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
-        draw('asu', 'AQA', filter_1, 'llama3_70b_judge')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])
+        else:
+            dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
+            draw('asu', 'AQA', filter_1, 'llama3_70b_judge')
 
 
 def er():
     st.title("Task: Emotion Recognition")
 
-    filters_levelone = ['IEMOCAP-Emotion-Test',
+    sum = ['Summarization']
+
+    dataset_lists = ['IEMOCAP-Emotion-Test',
                      'MELD-Sentiment-Test',
                      'MELD-Emotion-Test']
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -200,8 +231,11 @@ def er():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
-        draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('ER', ['llama3_70b_judge_binary'])
+        else:
+            dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
+            draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
 
 
 def ar():
@@ -216,15 +250,21 @@ def ar():
 
 
     if filter_1:
+        # if filter_1 in sum:
+        #     sum_table('aR', 'llama3_70b_judge')
+        # else:
         dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
         draw('vu', 'AR', filter_1, 'llama3_70b_judge')
 
 
 def gr():
     st.title("Task: Gender Recognition")
+    sum = ['Summarization']
 
-    filters_levelone = ['VoxCeleb-Gender-Test',
+    dataset_lists = ['VoxCeleb-Gender-Test',
                      'IEMOCAP-Gender-Test']
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -232,19 +272,24 @@ def gr():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
-        draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('GR', ['llama3_70b_judge_binary'])
+        else:
+            dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
+            draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
 
 
 def spt():
     st.title("Task: Speech Translation")
-
-    filters_levelone = ['Covost2-EN-ID-test',
+    sum = ['Summarization']
+    dataset_lists = ['Covost2-EN-ID-test',
                      'Covost2-EN-ZH-test',
                      'Covost2-EN-TA-test',
                      'Covost2-ID-EN-test',
                      'Covost2-ZH-EN-test',
                      'Covost2-TA-EN-test']
+
+    filters_levelone = sum + dataset_lists
 
     left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
@@ -252,8 +297,11 @@ def spt():
     filter_1 = st.selectbox('Dataset', filters_levelone)
 
     if filter_1:
-        dataset_contents(spt_datasets[filter_1], metrics['bleu'])
-        draw('su', 'ST', filter_1, 'bleu')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('ST', ['bleu'])
+        else:
+            dataset_contents(spt_datasets[filter_1], metrics['bleu'])
+            draw('su', 'ST', filter_1, 'bleu')
 
 
 def cnasr():
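The pattern repeated across the task pages above: a synthetic 'Summarization' entry is prepended to the dataset selector, and choosing it routes to the cross-dataset summary table instead of a single-dataset chart. A stripped-down sketch of that routing, with stub bodies and illustrative dataset names (draw_one_dataset stands in for the app's draw(...); the sketch uses the name summary because the app's sum list shadows the Python builtin sum()):

import streamlit as st

def sum_table_mulit_metrix(task_name, metrics_lists):
    st.write(f"summary table for {task_name}: {metrics_lists}")  # stub

def draw_one_dataset(dataset_name):
    st.write(f"per-dataset chart for {dataset_name}")  # stub

summary = ['Summarization']  # avoids shadowing builtin sum()
dataset_lists = ['LibriSpeech-Test-Clean', 'LibriSpeech-Test-Other']
filters_levelone = summary + dataset_lists

filter_1 = st.selectbox('Dataset', filters_levelone)
if filter_1:
    if filter_1 in summary:
        sum_table_mulit_metrix('ASR', ['wer'])
    else:
        draw_one_dataset(filter_1)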
app/summarization.py ADDED
@@ -0,0 +1,116 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+from streamlit_echarts import st_echarts
+from streamlit.components.v1 import html
+# from PIL import Image
+from app.show_examples import *
+from typing import List
+
+from model_information import get_dataframe
+
+info_df = get_dataframe()
+
+metrics_info = {
+    'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
+    'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
+    'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
+    'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
+    'bleu': 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
+}
+
+def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
+
+    for metrics in metrics_lists:
+        folder = f"./results/{metrics}/"
+        data_path = f'{folder}/{task_name.lower()}.csv'
+
+        chart_data = pd.read_csv(data_path).round(3)
+        selected_columns = [i for i in chart_data.columns if i != 'Model']
+        chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
+
+        # new_dataset_name = dataset_name.replace('-', '_').lower()
+
+        st.markdown("""
+            <style>
+            .stMultiSelect [data-baseweb=select] span {
+                max-width: 800px;
+                font-size: 0.9rem;
+                background-color: #3C6478 !important; /* Background color for selected items */
+                color: white; /* Change text color */
+            }
+            </style>
+            """, unsafe_allow_html=True)
+
+        # remap model names
+        display_model_names = {key.strip(): val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
+        chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
+
+        models = st.multiselect("Please choose the model",
+                                sorted(chart_data['model_show'].tolist()),
+                                default=sorted(chart_data['model_show'].tolist()),
+                                key=f"multiselect_{task_name}_{metrics}"
+                                )
+
+        chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
+        # chart_data = chart_data.sort_values(by=['Average'], ascending=True).dropna(axis=0)
+
+        if len(chart_data) == 0: return
+
+        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+        '''
+        Show Table
+        '''
+        with st.container():
+            st.markdown('#### Overall Evaluation Results')
+            st.markdown(f'###### Evaluation Method: {metrics_info[metrics]}')
+
+            model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
+
+            chart_data['model_link'] = chart_data['model_show'].map(model_link)
+
+            tabel_columns = [i for i in chart_data.columns if i not in ['Model', 'model_show']]
+            column_to_front = 'Average'
+            new_order = [column_to_front] + [col for col in tabel_columns if col != column_to_front]
+
+            chart_data_table = chart_data[['model_show'] + new_order]
+
+
+            # Round the leading score column to 3 decimal places
+            chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3))
+
+            if metrics in ['wer']:
+                ascend = True
+            else:
+                ascend = False
+
+            chart_data_table = chart_data_table.sort_values(
+                by=['Average'],
+                ascending=ascend
+            ).reset_index(drop=True)
+
+            def highlight_first_element(x):
+                # Create a DataFrame with the same shape as the input
+                df_style = pd.DataFrame('', index=x.index, columns=x.columns)
+
+                # Apply background color to the top row's score cell (row 0, column 1)
+                df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
+
+                return df_style
+
+            styled_df = chart_data_table.style.apply(
+                highlight_first_element, axis=None
+            )
+
+            st.dataframe(
+                styled_df,
+                column_config={
+                    'model_show': 'Model',
+                    chart_data_table.columns[1]: {'alignment': 'left'},
+                    "model_link": st.column_config.LinkColumn(
+                        "Model Link",
+                    ),
+                },
+                hide_index=True,
+                use_container_width=True
+            )