zhuohan-7 committed on
Commit 1d32376
1 Parent(s): 84f9e61

Upload folder using huggingface_hub

Files changed (4)
  1. app/content.py +46 -0
  2. app/draw_diagram.py +14 -21
  3. app/pages.py +57 -42
  4. app/summarization.py +95 -89
app/content.py CHANGED
@@ -67,4 +67,50 @@ metrics = {
     'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
     'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
     'bleu': 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
 }
+
+metrics_info = {
+    'wer': 'Word Error Rate (WER) - The lower, the better.',
+    'llama3_70b_judge_binary': 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
+    'llama3_70b_judge': 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
+    'meteor': 'METEOR Score. The higher, the better.',
+    'bleu': 'BLEU Score. The higher, the better.',
+}
+
+dataname_column_rename_in_table = {
+    'librispeech_test_clean': 'LibriSpeech-Clean',
+    'librispeech_test_other': 'LibriSpeech-Other',
+    'common_lvoice_15_en_test': 'CommonVoice-15-EN',
+    'peoples_speech_test': 'Peoples-Speech',
+    'gigaspeech_test': 'GigaSpeech-1',
+    'earnings21_test': 'Earnings-21',
+    'earnings22_test': 'Earnings-22',
+    'tedlium3_test': 'TED-LIUM-3',
+    'tedlium3_long_form_test': 'TED-LIUM-3-Long',
+    'aishel_asr_zh_test': 'Aishell-ASR-ZH',
+    'covost2_en_id_test': 'Covost2-EN-ID',
+    'covost2_en_zh_test': 'Covost2-EN-ZH',
+    'covost2_en_ta_test': 'Covost2-EN-TA',
+    'covost2_id_en_test': 'Covost2-ID-EN',
+    'covost2_zh_en_test': 'Covost2-ZH-EN',
+    'covost2_ta_en_test': 'Covost2-TA-EN',
+    'cn_college_listen_mcq_test': 'CN-College-Listen-MCQ',
+    'dream_tts_mcq_test': 'DREAM-TTS-MCQ',
+    'slue_p2_sqa5_test': 'SLUE-P2-SQA5',
+    'public_sg_speech_qa_test': 'Public-SG-Speech-QA',
+    'spoken_squad_test': 'Spoken-SQuAD',
+    'openhermes_audio_test': 'OpenHermes-Audio',
+    'alpaca_audio_test': 'ALPACA-Audio',
+    'wavcaps_test': 'WavCaps',
+    'audiocaps_test': 'AudioCaps',
+    'clotho_aqa_test': 'Clotho-AQA',
+    'wavcaps_qa_test': 'WavCaps-QA',
+    'audiocaps_qa_test': 'AudioCaps-QA',
+    'voxceleb_accent_test': 'VoxCeleb-Accent',
+    'voxceleb_gender_test': 'VoxCeleb-Gender',
+    'iemocap_gender_test': 'IEMOCAP-Gender',
+    'iemocap_emotion_test': 'IEMOCAP-Emotion',
+    'meld_sentiment_test': 'MELD-Sentiment',
+    'meld_emotion_test': 'MELD-Emotion',
+}
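
Both consumers of the new `dataname_column_rename_in_table` mapping apply it through `pandas.DataFrame.rename`, which silently skips keys absent from a given frame, so the single mapping can be reused against every task's results CSV. A minimal sketch of that behavior, with an illustrative frame standing in for one of the `./results/<metric>/<task>.csv` files:

```python
import pandas as pd

from app.content import dataname_column_rename_in_table

# Illustrative stand-in for a ./results/<metric>/<task>.csv file.
raw = pd.DataFrame({
    'Model': ['model_a', 'model_b'],
    'librispeech_test_clean': [0.031, 0.042],
    'librispeech_test_other': [0.069, 0.088],
})

# rename(columns=...) only touches columns present in the mapping;
# 'Model' and any unmapped columns pass through unchanged.
display = raw.rename(columns=dataname_column_rename_in_table)
print(display.columns.tolist())
# ['Model', 'LibriSpeech-Clean', 'LibriSpeech-Other']
```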
app/draw_diagram.py CHANGED
@@ -5,39 +5,30 @@ from streamlit_echarts import st_echarts
 from streamlit.components.v1 import html
 # from PIL import Image
 from app.show_examples import *
+from app.content import *
 import pandas as pd
 
 from model_information import get_dataframe
 
 
-# huggingface_image = Image.open('style/huggingface.jpg')
-
-# other info
-# path = "./AudioBench-Leaderboard/additional_info/Leaderboard-Rename.xlsx"
-# path = "./additional_info/Leaderboard-Rename.xlsx"
-
-# info_df = pd.read_excel(path)
 
 info_df = get_dataframe()
 
-# def nav_to(value):
-#     try:
-#         url = links_dic[str(value).lower()]
-#         js = f'window.open("{url}", "_blank").then(r => window.parent.location.href);'
-#         st_javascript(js)
-#     except:
-#         pass
 
 def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
 
     folder = f"./results/{metrics}/"
 
-
+    # Load the results from CSV
     data_path = f'{folder}/{category_name.lower()}.csv'
     chart_data = pd.read_csv(data_path).round(3)
     new_dataset_name = dataset_name.replace('-', '_').lower()
     chart_data = chart_data[['Model', new_dataset_name]]
-
+
+    # Rename to proper display name
+    new_dataset_name = dataname_column_rename_in_table[new_dataset_name]
+    chart_data = chart_data.rename(columns=dataname_column_rename_in_table)
+
     st.markdown("""
     <style>
     .stMultiSelect [data-baseweb=select] span {
@@ -253,10 +244,12 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
     st.session_state.show_examples = not st.session_state.show_examples
 
     if st.session_state.show_examples:
+
+        st.markdown('To be implemented')
 
-        # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
-        if dataset_name in []:
-            pass
-        else:
-            show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)
+        # # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
+        # if dataset_name in []:
+        #     pass
+        # else:
+        #     show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)
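
For reference, the load-and-rename sequence that `draw()` now performs can be summarized as below. This is a sketch, not the committed function: the `.get()` fallback is an assumption for dataset names missing from the mapping, whereas the committed code indexes `dataname_column_rename_in_table` directly and would raise `KeyError` for an unmapped name.

```python
import pandas as pd

from app.content import dataname_column_rename_in_table

def load_chart_data(metrics: str, category_name: str, dataset_name: str) -> pd.DataFrame:
    # Results live under ./results/<metric>/<category>.csv, as in draw().
    data_path = f"./results/{metrics}/{category_name.lower()}.csv"
    chart_data = pd.read_csv(data_path).round(3)

    # UI names are hyphenated ('LibriSpeech-Test-Clean'); CSV columns are
    # lowercase with underscores ('librispeech_test_clean').
    column_name = dataset_name.replace('-', '_').lower()
    chart_data = chart_data[['Model', column_name]]

    # Assumption: fall back to the raw column name when the mapping has no
    # entry; draw() itself uses dataname_column_rename_in_table[column_name].
    display_name = dataname_column_rename_in_table.get(column_name, column_name)
    return chart_data.rename(columns={column_name: display_name})
```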
 
app/pages.py CHANGED
@@ -88,9 +88,9 @@ def dashboard():
 def asr():
     st.title("Task: Automatic Speech Recognition")
 
-    sum = ['Summarization']
-
-    dataset_lists = ['LibriSpeech-Test-Clean',
+    sum = ['Overall']
+    dataset_lists = [
+        'LibriSpeech-Test-Clean',
         'LibriSpeech-Test-Other',
         'Common-Voice-15-En-Test',
         'Peoples-Speech-Test',
@@ -99,13 +99,11 @@ def asr():
         'Earnings22-Test',
         'Tedlium3-Test',
         'Tedlium3-Long-form-Test',
-        #'IMDA-Part1-ASR-Test',
-        #'IMDA-Part2-ASR-Test'
         ]
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -117,12 +115,35 @@ def asr():
         dataset_contents(asr_datsets[filter_1], metrics['wer'])
         draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 
+
+def cnasr():
+    st.title("Task: Automatic Speech Recognition - Mandarin")
+
+    sum = ['Overall']
+    dataset_lists = [
+        'Aishell-ASR-ZH-Test',
+        ]
+
+    filters_levelone = sum + dataset_lists
+
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
+
+    with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
+
+    if filter_1:
+        if filter_1 in sum:
+            sum_table_mulit_metrix('CNASR', ['wer'])
+        else:
+            dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
+            draw('su', 'CNASR', filter_1, 'wer')
+
 
 def sqa():
     st.title("Task: Speech Question Answering")
 
-    sum = ['Summarization']
+    sum = ['Overall']
 
     binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
 
@@ -132,7 +153,7 @@ def sqa():
 
     filters_levelone = sum + binary + rest
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -152,14 +173,14 @@ def sqa():
 def si():
     st.title("Task: Speech Instruction")
 
-    sum = ['Summarization']
+    sum = ['Overall']
 
     dataset_lists = ['OpenHermes-Audio-Test',
                      'ALPACA-Audio-Test']
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -178,7 +199,7 @@ def ac():
                      'AudioCaps-Test']
     filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -193,7 +214,7 @@ def ac():
 def asqa():
     st.title("Task: Audio Scene Question Answering")
 
-    sum = ['Summarization']
+    sum = ['Overall']
 
     dataset_lists = ['Clotho-AQA-Test',
                      'WavCaps-QA-Test',
@@ -201,7 +222,7 @@ def asqa():
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -211,13 +232,13 @@ def asqa():
             sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])
         else:
             dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
-            draw('asu', 'AQA',filter_1, 'llama3_70b_judge')
+            draw('asu', 'AQA', filter_1, 'llama3_70b_judge')
 
 
 def er():
     st.title("Task: Emotion Recognition")
 
-    sum = ['Summarization']
+    sum = ['Overall']
 
     dataset_lists = ['IEMOCAP-Emotion-Test',
                      'MELD-Sentiment-Test',
@@ -225,7 +246,7 @@ def er():
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -241,32 +262,38 @@ def er():
 def ar():
     st.title("Task: Accent Recognition")
 
-    filters_levelone = ['VoxCeleb-Accent-Test']
+    sum = ['Overall']
+    dataset_lists = ['VoxCeleb-Accent-Test']
+
+    filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
     if filter_1:
-        # if filter_1 in sum:
-        #     sum_table('aR', 'llama3_70b_judge')
-        # else:
-        dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
-        draw('vu', 'AR', filter_1, 'llama3_70b_judge')
+        if filter_1 in sum:
+            sum_table_mulit_metrix('AR', ['llama3_70b_judge'])
+            # sum_table('aR', 'llama3_70b_judge')
+        else:
+            dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
+            draw('vu', 'AR', filter_1, 'llama3_70b_judge')
 
 
 def gr():
     st.title("Task: Gender Recognition")
-    sum = ['Summarization']
+
+    sum = ['Overall']
 
     dataset_lists = ['VoxCeleb-Gender-Test',
                      'IEMOCAP-Gender-Test']
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -281,8 +308,10 @@ def gr():
 
 def spt():
     st.title("Task: Speech Translation")
-    sum = ['Summarization']
-    dataset_lists = ['Covost2-EN-ID-test',
+
+    sum = ['Overall']
+    dataset_lists = [
+        'Covost2-EN-ID-test',
         'Covost2-EN-ZH-test',
         'Covost2-EN-TA-test',
         'Covost2-ID-EN-test',
@@ -291,7 +320,7 @@ def spt():
 
     filters_levelone = sum + dataset_lists
 
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
 
     with left:
         filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -303,17 +332,3 @@ def spt():
         dataset_contents(spt_datasets[filter_1], metrics['bleu'])
         draw('su', 'ST', filter_1, 'bleu')
 
-
-def cnasr():
-    st.title("Task: Automatic Speech Recognition (Chinese)")
-
-    filters_levelone = ['Aishell-ASR-ZH-Test']
-
-    left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
-
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-
-    if filter_1:
-        dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
-        draw('su', 'CNASR', filter_1, 'wer')
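Every task page now follows the same skeleton: an 'Overall' pseudo-entry is prepended to the dataset list, routing to `sum_table_mulit_metrix` for the cross-dataset summary and to `dataset_contents` plus `draw` for a single dataset. A hedged sketch of that shared pattern; the `task_page` helper and its parameters are illustrative, not part of the commit:

```python
import streamlit as st

# Assumes the same helpers pages.py uses: draw from app.draw_diagram,
# sum_table_mulit_metrix from app.summarization, and dataset_contents
# plus the metrics and per-task dataset-info dicts from the app module.

def task_page(title, task_key, folder, dataset_lists, datasets_info, metric):
    st.title(f"Task: {title}")

    sum = ['Overall']  # pseudo-entry that triggers the summary table
    filters_levelone = sum + dataset_lists

    # Five equal-width columns; only the leftmost holds the selector,
    # the rest pad the layout.
    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in sum:
            sum_table_mulit_metrix(task_key, [metric])
        else:
            dataset_contents(datasets_info[filter_1], metrics[metric])
            draw(folder, task_key, filter_1, metric)
```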
app/summarization.py CHANGED
@@ -5,6 +5,8 @@ from streamlit_echarts import st_echarts
 from streamlit.components.v1 import html
 # from PIL import Image
 from app.show_examples import *
+from app.content import *
+
 import pandas as pd
 from typing import List
 
@@ -12,107 +14,111 @@ from model_information import get_dataframe
 
 info_df = get_dataframe()
 
-metrics_info = {
-    'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
-    'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
-    'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
-    'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
-    'bleu': 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
-}
+metrics_info = metrics_info
 
 def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
-
+
+    # Combine chart data from multiple sources
+    chart_data = pd.DataFrame()
     for metrics in metrics_lists:
         folder = f"./results/{metrics}/"
         data_path = f'{folder}/{task_name.lower()}.csv'
+        one_chart_data = pd.read_csv(data_path).round(3)
+        if len(chart_data) == 0:
+            chart_data = one_chart_data
+        else:
+            chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
 
-        chart_data = pd.read_csv(data_path).round(3)
-        selected_columns = [i for i in chart_data.columns if i != 'Model']
-        chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
-
-        # new_dataset_name = dataset_name.replace('-', '_').lower()
-
-        st.markdown("""
-        <style>
-        .stMultiSelect [data-baseweb=select] span {
-            max-width: 800px;
-            font-size: 0.9rem;
-            background-color: #3C6478 !important; /* Background color for selected items */
-            color: white; /* Change text color */
-            back
-        }
-        </style>
-        """, unsafe_allow_html=True)
-
-        # remap model names
-        display_model_names = {key.strip(): val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
-        chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
-
-        models = st.multiselect("Please choose the model",
-                                sorted(chart_data['model_show'].tolist()),
-                                default=sorted(chart_data['model_show'].tolist()),
-                                key=f"multiselect_{task_name}_{metrics}"
-                                )
-
-        chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
-        # chart_data = chart_data.sort_values(by=['Average'], ascending=True).dropna(axis=0)
-
-        if len(chart_data) == 0: return
-
-        # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-        '''
-        Show Table
-        '''
-        with st.container():
-            st.markdown(f'#### Overal Evaluation Results')
-            st.markdown(f'###### Evaluation Method: {metrics_info[metrics]}')
-
-            model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
-
-            chart_data['model_link'] = chart_data['model_show'].map(model_link)
-
-            tabel_columns = [i for i in chart_data.columns if i not in ['Model', 'model_show']]
-            column_to_front = 'Average'
-            new_order = [column_to_front] + [col for col in tabel_columns if col != column_to_front]
-
-            chart_data_table = chart_data[['model_show'] + new_order]
-
-            # Format numeric columns to 2 decimal places
-            chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
-
-            if metrics in ['wer']:
-                ascend = True
-            else:
-                ascend = False
-
-            chart_data_table = chart_data_table.sort_values(
-                by=['Average'],
-                ascending=ascend
-            ).reset_index(drop=True)
-
-            def highlight_first_element(x):
-                # Create a DataFrame with the same shape as the input
-                df_style = pd.DataFrame('', index=x.index, columns=x.columns)
-
-                # Apply background color to the first element in row 0 (df[0][0])
-                df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
-
-                return df_style
-
-            styled_df = chart_data_table.style.apply(
-                highlight_first_element, axis=None
-            )
-
-            st.dataframe(
-                styled_df,
-                column_config={
-                    'model_show': 'Model',
-                    chart_data_table.columns[1]: {'alignment': 'left'},
-                    "model_link": st.column_config.LinkColumn(
-                        "Model Link",
-                    ),
-                },
-                hide_index=True,
-                use_container_width=True
-            )
+
+    selected_columns = [i for i in chart_data.columns if i != 'Model']
+    chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
+
+    # Update dataset name in table
+    chart_data = chart_data.rename(columns=dataname_column_rename_in_table)
+
+    st.markdown("""
+    <style>
+    .stMultiSelect [data-baseweb=select] span {
+        max-width: 800px;
+        font-size: 0.9rem;
+        background-color: #3C6478 !important; /* Background color for selected items */
+        color: white; /* Change text color */
+        back
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+    # remap model names
+    display_model_names = {key.strip(): val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
+    chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
+
+    models = st.multiselect("Please choose the model",
+                            sorted(chart_data['model_show'].tolist()),
+                            default=sorted(chart_data['model_show'].tolist()),
+                            # key=f"multiselect_{task_name}_{metrics}"
+                            )
+
+    chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
+    # chart_data = chart_data.sort_values(by=['Average'], ascending=True).dropna(axis=0)
+
+    if len(chart_data) == 0: return
+
+    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+    '''
+    Show Table
+    '''
+    with st.container():
+        st.markdown(f'##### TABLE')
+
+        model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
+
+        chart_data['model_link'] = chart_data['model_show'].map(model_link)
+
+        tabel_columns = [i for i in chart_data.columns if i not in ['Model', 'model_show']]
+        column_to_front = 'Average'
+        new_order = [column_to_front] + [col for col in tabel_columns if col != column_to_front]
+
+        chart_data_table = chart_data[['model_show'] + new_order]
+
+        # Format numeric columns to 2 decimal places
+        chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
+
+        if metrics in ['wer']:
+            ascend = True
+        else:
+            ascend = False
+
+        chart_data_table = chart_data_table.sort_values(
+            by=['Average'],
+            ascending=ascend
+        ).reset_index(drop=True)
+
+        # Highlight the best performing model
+        def highlight_first_element(x):
+            # Create a DataFrame with the same shape as the input
+            df_style = pd.DataFrame('', index=x.index, columns=x.columns)
+            # Apply background color to the first element in row 0 (df[0][0])
+            df_style.iloc[0, 1] = 'background-color: #b0c1d7; color: white'
+            return df_style
+
+        styled_df = chart_data_table.style.apply(
+            highlight_first_element, axis=None
+        )
+
+        st.dataframe(
+            styled_df,
+            column_config={
+                'model_show': 'Model',
+                chart_data_table.columns[1]: {'alignment': 'left'},
+                "model_link": st.column_config.LinkColumn(
+                    "Model Link",
+                ),
+            },
+            hide_index=True,
+            use_container_width=True
+        )
+
+    # for metrics in metrics_lists:
+    # Only report the last metrics
+    st.markdown(f'###### Metric: {metrics_info[metrics]}')
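
The rewritten `sum_table_mulit_metrix` merges one CSV per metric into a single frame keyed on `Model` before averaging. `how='outer'` keeps models that appear in only some files, leaving NaN elsewhere; the later `dropna(axis=0)` then drops any model with an incomplete row. A small self-contained sketch of that merge (the frames and column names are illustrative):

```python
import pandas as pd

# Illustrative stand-ins for two ./results/<metric>/<task>.csv files.
wer = pd.DataFrame({'Model': ['a', 'b'], 'librispeech_test_clean': [0.03, 0.05]})
judge = pd.DataFrame({'Model': ['a', 'c'], 'slue_p2_sqa5_test': [71.2, 64.0]})

# how='outer' keeps every model seen in either file; a model missing from
# one file gets NaN there, which the later dropna(axis=0) removes.
merged = pd.merge(wer, judge, on='Model', how='outer')
print(merged)
#   Model  librispeech_test_clean  slue_p2_sqa5_test
# 0     a                    0.03               71.2
# 1     b                    0.05                NaN
# 2     c                     NaN               64.0
```

Note that every caller in app/pages.py passes a single-metric list, so the 'Average' column never mixes scales such as WER (lower is better) with judge scores (higher is better); the trailing `st.markdown` likewise relies on the loop variable `metrics`, which is only well-defined because one metric is passed at a time.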