Spaces:
Running
Running
import streamlit as st | |
from app.draw_diagram import * | |
from app.content import * | |
from app.summarization import * | |
def dataset_contents(dataset, metrics):
    """Render the "About this dataset" / "About this metric" blurbs.

    Emits a ``<style>`` block defining the ``my-dataset-info`` class and then
    one ``<div>`` of that class per description.

    Args:
        dataset: Text interpolated after "About this dataset".
        metrics: Text interpolated after "About this metric".

    Both values are inserted into raw HTML without escaping
    (``unsafe_allow_html=True``), so any markup they contain is passed through.
    """
    # The two disabled declarations are wrapped in real CSS comments. A leading
    # "#" is not comment syntax in CSS; it only worked via parser error recovery.
    custom_css = (
        "\n"
        "<style>\n"
        ".my-dataset-info {\n"
        "/* background-color: #F9EBEA; */\n"
        "/* padding: 10px; */\n"
        "color: #050505;\n"
        "font-style: normal;\n"
        "font-size: 8px;\n"
        "height: auto;\n"
        "}\n"
        "</style>\n"
    )
    st.markdown(custom_css, unsafe_allow_html=True)

    # Same markup for both blurbs; only the label and the text differ.
    for label, text in (("dataset", dataset), ("metric", metrics)):
        st.markdown(
            '<div class="my-dataset-info">\n'
            f"<p><b>About this {label}</b>: {text}</p>\n"
            "</div>",
            unsafe_allow_html=True,
        )
def dashboard():
    """Render the AudioBench landing page.

    Three containers are emitted in order: the title/badges/news/intro
    section, the overview image with the headline numbers, and the citation.
    """
    paper_link = "https://arxiv.org/abs/2406.16020"

    badges_md = (
        "\n"
        "[gh]: https://github.com/AudioLLMs/AudioBench\n"
        "[![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh]\n"
        "[![GitHub watchers](https://img.shields.io/github/watchers/AudioLLMs/AudioBench?style=social)][gh]\n"
    )

    # NOTE(review): the BibTeX lines carry no leading indentation in the text
    # this was rebuilt from; confirm the rendered citation still looks right.
    citation_md = (
        "\n"
        ":round_pushpin: AudioBench Paper \n\n"
        "@article{wang2024audiobench,\n"
        "title={AudioBench: A Universal Benchmark for Audio Large Language Models},\n"
        "author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F},\n"
        "journal={arXiv preprint arXiv:2406.16020},\n"
        "year={2024}\n"
        "}\n"
    )

    # (label, value) pairs shown in columns 1..3 of a ten-column row.
    headline_numbers = (
        ("Tasks", ">8"),
        ("Datasets", ">30"),
        ("Evaluated Models", ">5"),
    )

    with st.container():
        st.title("AudioBench")
        st.markdown(badges_md)

        st.markdown("#### News")
        st.markdown("**Dec, 2024**: Update layout and support comparison between models with similar model sizes. Layout reorganized for better user experience. Add performance summary for each task.")
        st.markdown("**Sep, 2024**: Initial leaderboard online.")
        st.divider()

        st.markdown(f"#### What is [AudioBench]({paper_link})?")
        st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audio large language models.")
        st.markdown("##### :dizzy: A evaluation benchmark that we consistently put effort in updating and maintaining.")
        st.markdown("\n")

    with st.container():
        # Only the middle of the three columns is used, which centres the image.
        _, center_co, _ = st.columns([0.5, 1, 0.5])
        with center_co:
            st.image(
                "./style/audio_overview.png",
                caption="Overview of the datasets in AudioBench.",
            )

        st.markdown("\n")
        st.markdown("###### :dart: Our Benchmark includes: ")

        cols = st.columns(10)
        for position, (label, value) in enumerate(headline_numbers, start=1):
            cols[position].metric(label=label, value=value)
        st.divider()

    with st.container():
        st.markdown("##### Citations")
        st.markdown(citation_md)
def asr() -> None:
    """Render the "Automatic Speech Recognition" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('ASR', ['wer'])``; choosing a dataset calls
    ``dataset_contents`` with that dataset's entry and the 'wer' metric entry,
    then ``draw`` for that dataset with ``cus_sort=True``.
    """
    st.title("Task: Automatic Speech Recognition")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'LibriSpeech-Test-Clean',
        'LibriSpeech-Test-Other',
        'Common-Voice-15-En-Test',
        'Peoples-Speech-Test',
        'GigaSpeech-Test',
        'Earnings21-Test',
        'Earnings22-Test',
        'Tedlium3-Test',
        'Tedlium3-Long-form-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('ASR', ['wer'])
    else:
        # ``asr_datsets`` (sic) is the externally defined name this module uses.
        dataset_contents(asr_datsets[filter_1], metrics['wer'])
        draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
def cnasr() -> None:
    """Render the "Automatic Speech Recognition - Mandarin" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('CNASR', ['wer'])``; choosing a dataset calls
    ``dataset_contents`` with that dataset's entry and the 'wer' metric entry,
    then ``draw`` for that dataset.
    """
    st.title("Task: Automatic Speech Recognition - Mandarin")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'Aishell-ASR-ZH-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('CNASR', ['wer'])
    else:
        dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
        draw('su', 'CNASR', filter_1, 'wer')
def sqa() -> None:
    """Render the "Speech Question Answering" task page.

    Shows a dataset selector. 'Overall' calls ``sum_table_mulit_metrix`` with
    both judge metrics. Datasets listed in ``binary`` are handled with the
    'llama3_70b_judge_binary' metric; the remaining datasets use
    'llama3_70b_judge'. In both cases ``dataset_contents`` is called first and
    ``draw`` second.
    """
    st.title("Task: Speech Question Answering")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
    rest = [
        'SLUE-P2-SQA5-Test',
        'Public-SG-Speech-QA-Test',
        'Spoken-Squad-Test',
    ]
    filters_levelone = overall_options + binary + rest

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('SQA', ['llama3_70b_judge_binary', 'llama3_70b_judge'])
        return

    # The metric depends only on which of the two dataset groups was picked.
    metric_key = 'llama3_70b_judge_binary' if filter_1 in binary else 'llama3_70b_judge'
    dataset_contents(sqa_datasets[filter_1], metrics[metric_key])
    draw('su', 'SQA', filter_1, metric_key)
def si() -> None:
    """Render the "Speech Instruction" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('SI', ['llama3_70b_judge'])``; choosing a dataset
    calls ``dataset_contents`` with that dataset's entry and the
    'llama3_70b_judge' metric entry, then ``draw`` for that dataset.
    """
    st.title("Task: Speech Instruction")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'OpenHermes-Audio-Test',
        'ALPACA-Audio-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('SI', ['llama3_70b_judge'])
    else:
        dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
        draw('su', 'SI', filter_1, 'llama3_70b_judge')
def ac() -> None:
    """Render the "Audio Captioning" task page.

    Shows a dataset selector and a metric selector. When both have a value,
    the metric label is normalised to a lookup key (lower-cased, '-' replaced
    by '_'), ``dataset_contents`` is called with the matching dataset and
    metric entries, and ``draw`` is called for that dataset/metric pair.
    """
    st.title("Task: Audio Captioning")

    filters_levelone = [
        'WavCaps-Test',
        'AudioCaps-Test',
    ]
    filters_leveltwo = ['Llama3-70b-judge', 'Meteor']

    # Five equal columns; the dataset picker sits in the first and the metric
    # picker in the fourth.
    columns = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    left, middle = columns[0], columns[3]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)
    with middle:
        metric = st.selectbox('Metric', filters_leveltwo)

    # Both selections are dereferenced below, so both must be present. The
    # previous ``or`` would let an empty value reach the lookups and fail.
    if filter_1 and metric:
        metric_key = metric.lower().replace('-', '_')
        dataset_contents(ac_datasets[filter_1], metrics[metric_key])
        draw('asu', 'AC', filter_1, metric_key)
def asqa() -> None:
    """Render the "Audio Scene Question Answering" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])``; choosing a dataset
    calls ``dataset_contents`` with that dataset's entry and the
    'llama3_70b_judge' metric entry, then ``draw`` for that dataset.
    """
    st.title("Task: Audio Scene Question Answering")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'Clotho-AQA-Test',
        'WavCaps-QA-Test',
        'AudioCaps-QA-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])
    else:
        dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
        draw('asu', 'AQA', filter_1, 'llama3_70b_judge')
def er() -> None:
    """Render the "Emotion Recognition" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('ER', ['llama3_70b_judge_binary'])``; choosing a
    dataset calls ``dataset_contents`` with that dataset's entry and the
    'llama3_70b_judge_binary' metric entry, then ``draw`` for that dataset.
    """
    st.title("Task: Emotion Recognition")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'IEMOCAP-Emotion-Test',
        'MELD-Sentiment-Test',
        'MELD-Emotion-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('ER', ['llama3_70b_judge_binary'])
    else:
        dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
        draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
def ar() -> None:
    """Render the "Accent Recognition" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('AR', ['llama3_70b_judge'])``; choosing a dataset
    calls ``dataset_contents`` with that dataset's entry and the
    'llama3_70b_judge' metric entry, then ``draw`` for that dataset.
    """
    st.title("Task: Accent Recognition")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = ['VoxCeleb-Accent-Test']
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('AR', ['llama3_70b_judge'])
    else:
        # ``ar_datsets`` (sic) is the externally defined name this module uses.
        dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
        draw('vu', 'AR', filter_1, 'llama3_70b_judge')
def gr() -> None:
    """Render the "Gender Recognition" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('GR', ['llama3_70b_judge_binary'])``; choosing a
    dataset calls ``dataset_contents`` with that dataset's entry and the
    'llama3_70b_judge_binary' metric entry, then ``draw`` for that dataset.
    """
    st.title("Task: Gender Recognition")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'VoxCeleb-Gender-Test',
        'IEMOCAP-Gender-Test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('GR', ['llama3_70b_judge_binary'])
    else:
        dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
        draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
def spt() -> None:
    """Render the "Speech Translation" task page.

    Shows a dataset selector. Choosing 'Overall' calls
    ``sum_table_mulit_metrix('ST', ['bleu'])``; choosing a dataset calls
    ``dataset_contents`` with that dataset's entry and the 'bleu' metric
    entry, then ``draw`` for that dataset.
    """
    st.title("Task: Speech Translation")

    # Named so it does not shadow the builtin ``sum``.
    overall_options = ['Overall']
    dataset_lists = [
        'Covost2-EN-ID-test',
        'Covost2-EN-ZH-test',
        'Covost2-EN-TA-test',
        'Covost2-ID-EN-test',
        'Covost2-ZH-EN-test',
        'Covost2-TA-EN-test',
    ]
    filters_levelone = overall_options + dataset_lists

    # Five equal columns are laid out; only the first one hosts a widget.
    left = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])[0]
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if not filter_1:
        return

    if filter_1 in overall_options:
        sum_table_mulit_metrix('ST', ['bleu'])
    else:
        dataset_contents(spt_datasets[filter_1], metrics['bleu'])
        draw('su', 'ST', filter_1, 'bleu')