Sasha
Initial version of the Evaluation Buddy -- currently most things are hardcoded (e.g. the dataset list), but the goal is to make it all compatible with the Hub!
1f09890
import streamlit as st
from datasets import load_dataset_builder
from datasets import get_dataset_config_names
from os import listdir
from datasets import load_dataset, Dataset
from datasets_sql import query
import plotly.express as px
import numpy as np
import statistics
st.set_page_config(
    page_title="Evaluation Buddy",
    page_icon="./robot.png",
    layout="wide",
)

st.title("Hugging Face Evaluation Buddy")
top_datasets = ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es',
                'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news',
                'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps',
                'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',
                'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa',
                'race', 'winogrande']

tasks = ['text-classification', 'question-answering-extractive', 'automatic-speech-recognition']
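# Hypothetical sketch (not in this commit): in line with the goal of making the app
# Hub-compatible, the hardcoded dataset list above could eventually be fetched from
# the Hub instead, e.g. with huggingface_hub (assuming a client version where
# list_datasets() accepts a limit argument):
#   from huggingface_hub import list_datasets
#   top_datasets = [d.id for d in list_datasets(limit=30)]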
with st.sidebar.expander("Datasets", expanded=True):
    dataset_name = st.selectbox(
        f"Choose a dataset to evaluate on:",
        sorted(top_datasets))
    configs = get_dataset_config_names(dataset_name)
    dataset_config = st.selectbox(
        f"Choose a configuration of your dataset:",
        configs)
    dataset_builder = load_dataset_builder(dataset_name, dataset_config)
    splits = [s for s in dataset_builder.info.splits]
    dataset_split = st.selectbox(
        f"Choose a dataset split:",
        splits)
    balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20)
st.markdown("## Here is some information about your dataset:") | |
st.markdown("### Description") | |
st.markdown(dataset_builder.info.description) | |
st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/"+dataset_name+")") | |
st.markdown("### Dataset-Specific Metrics") | |
if dataset_name in listdir('../datasets/metrics/'):
    st.markdown("Great news! Your dataset has a dedicated metric for it! You can use it like this:")
    # Build a code snippet that loads the dataset's dedicated metric
    code = '''from datasets import load_metric
metric = load_metric("''' + dataset_name + '''", "''' + dataset_config + '''")'''
    st.code(code, language='python')
    dedicated_metric = True
else:
    st.markdown("Your dataset doesn't have a dedicated metric, but that's ok!")
    dedicated_metric = False
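# Hypothetical usage sketch: a dedicated metric is computed like any other
# `datasets` metric, by passing predictions and references to compute(), e.g.:
#   metric = load_metric("glue", "mrpc")
#   results = metric.compute(predictions=[0, 1], references=[0, 1])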
st.markdown("### Task-Specific Metrics") | |
try: | |
task = dataset_builder.info.task_templates[0].task | |
st.markdown("The task associated to it is: " + task) | |
if task == 'automatic-speech-recognition': | |
st.markdown('Automatic Speech Recognition has some dedicated metrics such as:') | |
st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)') | |
st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)') | |
else: | |
st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!") | |
except: | |
st.markdown("The task for your dataset doesn't have any dedicated metrics, but you can still use general ones!") | |
#print(dataset_builder.info.task_templates)
#print(dataset_builder.info.features)
#st.markdown("### General Metrics")
#dataset = load_dataset(dataset_name, dataset_config, dataset_split)
#print(dataset_name, dataset_config, dataset_split)
#print(labels.head())
try:
    num_classes = dataset_builder.info.features['label'].num_classes
    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
    # Count examples per label (datasets_sql picks up the local `dataset` variable);
    # ordering by label keeps the counts aligned with the label names below
    labels = query("SELECT COUNT(*) from dataset GROUP BY label ORDER BY label").to_pandas()
    labels = labels.rename(columns={"count_star()": "count"})
    labels.index = dataset_builder.info.features['label'].names
    st.markdown("### Labelled Metrics")
    st.markdown("Your dataset has " + str(num_classes) + " labels: " + ', '.join(dataset_builder.info.features['label'].names))
    #TODO : figure out how to make a label plot
    st.plotly_chart(px.pie(labels, values="count", names=labels.index, width=800, height=400))
    # Check how balanced the label distribution is against the chosen threshold
    total = sum(c for c in labels['count'])
    proportion = [c/total for c in labels['count']]
    #proportion = [0.85, 0.15]
    stdev_dataset = statistics.stdev(proportion)
    if stdev_dataset <= balanced_stdev:
        st.markdown("Since your dataset is well-balanced, you can look at using:")
        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
        accuracy_code = '''from datasets import load_metric
metric = load_metric("accuracy")'''
        st.code(accuracy_code, language='python')
    else:
        st.markdown("Since your dataset is not well-balanced, you can look at using:")
        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
        f1_code = '''from datasets import load_metric
metric = load_metric("f1")'''
        st.code(f1_code, language='python')
        st.markdown('It takes into account both precision and recall, which works well for evaluating model performance on minority classes.')
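    # Hypothetical usage sketch: whichever metric is suggested above, it is evaluated
    # the same way, e.g. macro-averaged F1 for a multi-class dataset (preds and refs
    # are placeholder lists of predicted and true label ids):
    #   f1 = load_metric("f1")
    #   results = f1.compute(predictions=preds, references=refs, average="macro")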
except:
    st.markdown("### Unsupervised Metrics")
    st.markdown("Since your dataset doesn't have any labels, the metrics that you can use for evaluation are:")
    st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
    perplexity_code = '''from datasets import load_metric
metric = load_metric("perplexity")'''
    st.code(perplexity_code, language='python')
    st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
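    # Hypothetical usage sketch: the perplexity metric scores a list of texts with a
    # causal language model from the Hub (argument names follow the metric card and
    # may differ between library versions):
    #   perplexity = load_metric("perplexity")
    #   results = perplexity.compute(model_id="gpt2", input_texts=["some generated text"])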