import streamlit as st

from app.draw_diagram import *
from app.content import *
from app.summarization import *


def dataset_contents(dataset, metrics):
    # Placeholder hook for page-level CSS tweaks; currently empty.
    custom_css = """
    """
    st.markdown(custom_css, unsafe_allow_html=True)

    st.markdown(f"About this dataset: {dataset}", unsafe_allow_html=True)
    st.markdown(f"About this metric: {metrics}", unsafe_allow_html=True)
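
# Example usage (illustrative only): `asr_datsets` and `metrics` are assumed
# to be the name-keyed lookup tables star-imported from app.content:
#
#   dataset_contents(asr_datsets['LibriSpeech-Test-Clean'], metrics['wer'])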


def dashboard():

    with st.container():
        st.title("AudioBench")

        st.markdown("""
[gh]: https://github.com/AudioLLMs/AudioBench

[![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh]
[![GitHub watchers](https://img.shields.io/github/watchers/AudioLLMs/AudioBench?style=social)][gh]
""")

        audio_url = "https://arxiv.org/abs/2406.16020"

        st.markdown("#### News")
        st.markdown("**Dec, 2024**: Reorganized the layout for a better user experience, "
                    "added comparisons between models of similar sizes, and added a "
                    "performance summary for each task.")
        st.markdown("**Sep, 2024**: Initial leaderboard online.")

        st.divider()

        st.markdown("#### What is [AudioBench](%s)?" % audio_url)
        st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audio large language models.")
        st.markdown("##### :dizzy: An evaluation benchmark that we consistently put effort into updating and maintaining.")

        st.markdown(''' ''')  # vertical spacer

    with st.container():
        left_co, center_co, right_co = st.columns([0.5, 1, 0.5])
        with center_co:
            st.image("./style/audio_overview.png",
                     caption="Overview of the datasets in AudioBench.",
                     # use_container_width=True,
                     )

        st.markdown(''' ''')  # vertical spacer

        st.markdown("###### :dart: Our benchmark includes:")
        cols = st.columns(10)
        cols[1].metric(label="Tasks", value=">8")  # delta="Tasks", delta_color="off"
        cols[2].metric(label="Datasets", value=">30")
        cols[3].metric(label="Evaluated Models", value=">5")

        st.divider()

    with st.container():
        st.markdown("##### Citations")
        st.markdown('''
:round_pushpin: AudioBench Paper

    @article{wang2024audiobench,
        title={AudioBench: A Universal Benchmark for Audio Large Language Models},
        author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F},
        journal={arXiv preprint arXiv:2406.16020},
        year={2024}
    }
''')


def asr():
    st.title("Task: Automatic Speech Recognition")

    summary = ['Overall']
    dataset_lists = [
        'LibriSpeech-Test-Clean',
        'LibriSpeech-Test-Other',
        'Common-Voice-15-En-Test',
        'Peoples-Speech-Test',
        'GigaSpeech-Test',
        'Earnings21-Test',
        'Earnings22-Test',
        'Tedlium3-Test',
        'Tedlium3-Long-form-Test',
    ]

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('ASR', ['wer'])
        else:
            dataset_contents(asr_datsets[filter_1], metrics['wer'])
            draw('su', 'ASR', filter_1, 'wer', cus_sort=True)


def cnasr():
    st.title("Task: Automatic Speech Recognition - Mandarin")

    summary = ['Overall']
    dataset_lists = [
        'Aishell-ASR-ZH-Test',
    ]

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('CNASR', ['wer'])
        else:
            dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
            draw('su', 'CNASR', filter_1, 'wer')


def sqa():
    st.title("Task: Speech Question Answering")

    summary = ['Overall']
    binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
    rest = ['SLUE-P2-SQA5-Test', 'Public-SG-Speech-QA-Test', 'Spoken-Squad-Test']

    filters_levelone = summary + binary + rest

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('SQA', ['llama3_70b_judge_binary', 'llama3_70b_judge'])
        elif filter_1 in binary:
            dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge_binary'])
            draw('su', 'SQA', filter_1, 'llama3_70b_judge_binary')
        else:
            dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
            draw('su', 'SQA', filter_1, 'llama3_70b_judge')


def si():
    st.title("Task: Speech Instruction")

    summary = ['Overall']
    dataset_lists = ['OpenHermes-Audio-Test', 'ALPACA-Audio-Test']

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('SI', ['llama3_70b_judge'])
        else:
            dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
            draw('su', 'SI', filter_1, 'llama3_70b_judge')


def ac():
    st.title("Task: Audio Captioning")

    filters_levelone = ['WavCaps-Test', 'AudioCaps-Test']
    filters_leveltwo = ['Llama3-70b-judge', 'Meteor']

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)
    with middle:
        metric = st.selectbox('Metric', filters_leveltwo)

    if filter_1 or metric:
        # Display names such as 'Llama3-70b-judge' are normalized to the keys
        # used in `metrics`, e.g. 'llama3_70b_judge'.
        dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
        draw('asu', 'AC', filter_1, metric.lower().replace('-', '_'))
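

# Most task pages in this module repeat one layout: a 'Dataset' selectbox in
# the left column, an aggregate table when 'Overall' is selected, and a
# per-dataset description plus chart otherwise. Below is a minimal sketch of
# how the single-metric pages could share that logic; `_task_page` is a
# hypothetical helper, not part of the existing app modules, and it skips the
# variations (sqa's two metrics, ac's second selectbox, asr's cus_sort):
def _task_page(task, datasets, metric, chart_type='su'):
    summary = ['Overall']
    left, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        choice = st.selectbox('Dataset', summary + list(datasets))
    if choice:
        if choice in summary:
            sum_table_mulit_metrix(task, [metric])
        else:
            dataset_contents(datasets[choice], metrics[metric])
            draw(chart_type, task, choice, metric)

# With the helper, si() above would reduce to:
#   _task_page('SI', si_datasets, 'llama3_70b_judge')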


def asqa():
    st.title("Task: Audio Scene Question Answering")

    summary = ['Overall']
    dataset_lists = ['Clotho-AQA-Test', 'WavCaps-QA-Test', 'AudioCaps-QA-Test']

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])
        else:
            dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
            draw('asu', 'AQA', filter_1, 'llama3_70b_judge')


def er():
    st.title("Task: Emotion Recognition")

    summary = ['Overall']
    dataset_lists = ['IEMOCAP-Emotion-Test', 'MELD-Sentiment-Test', 'MELD-Emotion-Test']

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('ER', ['llama3_70b_judge_binary'])
        else:
            dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
            draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')


def ar():
    st.title("Task: Accent Recognition")

    summary = ['Overall']
    dataset_lists = ['VoxCeleb-Accent-Test']

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('AR', ['llama3_70b_judge'])
            # sum_table('aR', 'llama3_70b_judge')
        else:
            dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
            draw('vu', 'AR', filter_1, 'llama3_70b_judge')


def gr():
    st.title("Task: Gender Recognition")

    summary = ['Overall']
    dataset_lists = ['VoxCeleb-Gender-Test', 'IEMOCAP-Gender-Test']

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('GR', ['llama3_70b_judge_binary'])
        else:
            dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
            draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')


def spt():
    st.title("Task: Speech Translation")

    summary = ['Overall']
    dataset_lists = [
        'Covost2-EN-ID-test',
        'Covost2-EN-ZH-test',
        'Covost2-EN-TA-test',
        'Covost2-ID-EN-test',
        'Covost2-ZH-EN-test',
        'Covost2-TA-EN-test',
    ]

    filters_levelone = summary + dataset_lists

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in summary:
            sum_table_mulit_metrix('ST', ['bleu'])
        else:
            dataset_contents(spt_datasets[filter_1], metrics['bleu'])
            draw('su', 'ST', filter_1, 'bleu')
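

# These page functions are presumably dispatched from the app entry point,
# which is not part of this file. A minimal sketch of one way to wire them up,
# shown commented out so the module stays import-safe (the sidebar navigation
# and page labels below are assumptions, not the project's actual wiring):
#
#   PAGES = {
#       'Dashboard': dashboard,
#       'Speech Recognition': asr,
#       'Speech Recognition (Mandarin)': cnasr,
#       'Speech Question Answering': sqa,
#       'Speech Instruction': si,
#       'Audio Captioning': ac,
#       'Audio Scene Question Answering': asqa,
#       'Emotion Recognition': er,
#       'Accent Recognition': ar,
#       'Gender Recognition': gr,
#       'Speech Translation': spt,
#   }
#
#   choice = st.sidebar.selectbox('Task', list(PAGES))
#   PAGES[choice]()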