import streamlit as st from datasets import load_dataset_builder from datasets import get_dataset_config_names from os import listdir from datasets import load_dataset, Dataset from datasets_sql import query import plotly.express as px import numpy as np import statistics st.set_page_config( page_title="HuggingFace Evaluation Buddy", page_icon="./robot.png", layout="wide", ) st.title("Hugging Face Evaluation Buddy") st.markdown('### Here to help you do mindful model evaluation') top_datasets= ['glue', 'super_glue', 'wikitext', 'imdb', 'squad', 'squad_es', \ 'paws', 'librispeech_asr', 'wmt16', 'xnli', 'snli', 'ag_news', \ 'anli', 'amazon_polarity', 'squad_v2', 'conll2003', 'red_caps', \ 'common_voice', 'stsb_multi_mt', 'trec', 'tweet_eval', 'cosmos_qa',\ 'sick', 'xsum', 'wikiann', 'yelp_polarity', 'hellaswag', 'piqa', \ 'race', 'winogrande'] tasks= ['classification', 'question answering', 'automatic speech recognition', 'natural language inference', \ 'translation', 'sentiment analysis', 'text simplification', 'named entity recognition', \ 'reading comprehension', 'paraphrase identification', 'natural language understanding',\ 'textual entailment', 'commonsense reasoning', 'summarization'] metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu'] dedicated_metric = False def find_task(dname): task = None dataset_builder = load_dataset_builder(dataset_name, dataset_config) try: task = dataset_builder.info.task_templates[0].task except: for t in tasks: if t in str(dataset_builder.info.description).lower(): task = t else: continue return(task) with st.sidebar.expander("Datasets", expanded=True): dataset_name = st.selectbox( f"Choose a dataset to evaluate on:", sorted(top_datasets)) configs = get_dataset_config_names(dataset_name) dataset_config = st.selectbox( f"Choose a configuration of the dataset:", configs) dataset_builder = load_dataset_builder(dataset_name, dataset_config) splits = [s for s in dataset_builder.info.splits] dataset_split = st.selectbox( f"Choose a dataset split:", splits) balanced_stdev = st.slider("Choose a standard deviation threshold for determining whether a dataset is balanced or not:", 0.00, 1.00, 0.20) st.markdown("## Here is some information about this dataset:") st.markdown(dataset_builder.info.description) if len(dataset_builder.info.description) == 1: st.markdown("This dataset does not have a description. :no_mouth:") st.markdown("For more information about this dataset, check out [its website](https://huggingface.co/datasets/"+dataset_name+")") st.markdown("## Now let's see what metrics we can use to evaluate models on this dataset!") st.markdown("### Dataset-Specific Metrics") if dataset_name in metrics: st.markdown("Great news! This dataset has a dedicated metric for it!:partying_face: You can use it like this: :point_down:") if "glue" in dataset_name: code = ''' from datasets import load_metric metric = load_metric(\"'''+dataset_name+'''\", \"'''+dataset_config+'''\")''' st.code(code, language='python') else: code = ''' from datasets import load_metric metric = load_metric(\"'''+dataset_name+'''\")''' st.code(code, language='python') dedicated_metric = True else: st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:") dedicated_metric = False if dedicated_metric == False: st.markdown("### Task-Specific Metrics") task = find_task(dataset_name) if task is not None: st.markdown("The task associated to it this dataset is: " + task.replace('-',' ')) if task == 'automatic-speech-recognition': st.markdown('Automatic Speech Recognition has some dedicated metrics such as:') st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)') wer_code = '''from datasets import load_metric metric = load_metric("wer")''' st.code(wer_code, language='python') st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)') cer_code = '''from datasets import load_metric metric = load_metric("cer")''' st.code(cer_code, language='python') else: st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:") #print(dataset_builder.info.task_templates) #print(dataset_builder.info.features) #st.markdown("### General Metrics") #dataset = load_dataset(dataset_name, dataset_config, dataset_split) #print(dataset_name, dataset_config, dataset_split) #print(labels.head()) if dedicated_metric == False: if dataset_name in ['glue','super_glue', 'paws', 'squad_es']: dataset = load_dataset(dataset_name, dataset_config, split=dataset_split) else: dataset = load_dataset(dataset_name, split=dataset_split) try: num_classes = dataset_builder.info.features['label'].num_classes labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas() labels = labels.rename(columns={"count_star()": "count"}) labels.index = dataset_builder.info.features['label'].names st.markdown("### Labelled Metrics") st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names)) #TODO : figure out how to make a label plot st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400)) total = sum(c for c in labels['count']) proportion = [c/total for c in labels['count']] #proportion = [0.85, 0.15] stdev_dataset= statistics.stdev(proportion) if stdev_dataset <= balanced_stdev: st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:") st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)') accuracy_code = '''from datasets import load_metric metric = load_metric("accuracy")''' st.code(accuracy_code, language='python') else: st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:") st.markdown('[F1 Score](https://huggingface.co/metrics/f1)') accuracy_code = '''from datasets import load_metric metric = load_metric("accuracy")''' st.code(accuracy_code, language='python') st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.') except: if task != 'automatic-speech-recognition': st.markdown("### Unsupervised Metrics") st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:") st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)') perplexity_code = '''from datasets import load_metric metric = load_metric("perplexity")''' st.code(perplexity_code, language='python') st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')