# Scrape residue from the hosting page header (Hugging Face "Spaces", status: Running); not part of the program.
import os
import uuid
from pathlib import Path

import pandas as pd
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import list_datasets

from evaluation import filter_evaluated_models
from utils import (
    commit_evaluation_log,
    format_col_mapping,
    get_compatible_models,
    get_key,
    get_metadata,
    http_get,
    http_post,
)
# Pull settings from a local .env file when one exists in the working directory.
if Path(".env").is_file():
    load_dotenv(".env")

# Runtime configuration read from the environment; each value is None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
# Numeric task identifier sent as the "task" field of the project creation payload.
TASK_TO_ID = {
    "binary_classification": 1,
    "multi_class_classification": 2,
    # "multi_label_classification": 3,  # Not fully supported in AutoTrain
    "entity_extraction": 4,
    "extractive_question_answering": 5,
    "translation": 6,
    "summarization": 8,
}
# Metrics displayed as "will be computed" for each task; they are also removed
# from the list of optional extra metrics the user can pick.
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": [],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
}
# Task names offered in the task selector, in TASK_TO_ID declaration order.
SUPPORTED_TASKS = list(TASK_TO_ID)
# Extracted from utils.get_supported_metrics
# Hardcoded for now due to speed / caching constraints
SUPPORTED_METRICS = [
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
    "lvwerra/test",
    "lvwerra/test_metric",
]
#######
# APP #
#######
st.title("Evaluation on the Hub")
# Introductory blurb rendered as Markdown under the page title.
# NOTE(review): the emoji were restored from mis-decoded bytes in the scraped
# text; confirm them against the deployed app.
st.markdown(
    """
Welcome to Hugging Face's automatic model evaluator 👋!
This application allows you to evaluate 🤗 Transformers
[models](https://huggingface.co/models?library=transformers&sort=downloads)
across a wide variety of [datasets](https://huggingface.co/datasets) on the
Hub. Please select the dataset and configuration below. The results of your
evaluation will be displayed on the [public
leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards).
"""
)
# Every dataset id returned by list_datasets() is offered in the selector.
all_datasets = [d.id for d in list_datasets()]

# A ?dataset=<id> query parameter pre-selects that dataset when the id is known;
# otherwise the first listed dataset is the default. Query parameter values
# arrive as lists, hence the [0].
query_params = st.experimental_get_query_params()
requested = query_params.get("dataset", [])
if requested and requested[0] in all_datasets:
    default_dataset = requested[0]
else:
    default_dataset = all_datasets[0]

selected_dataset = st.selectbox(
    "Select a dataset",
    all_datasets,
    index=all_datasets.index(default_dataset),
    help=(
        "Datasets with metadata can be evaluated with 1-click. Check out the "
        "[documentation](https://huggingface.co/docs/hub/datasets-cards) to add "
        "evaluation metadata to a dataset."
    ),
)
# Write the choice back to the URL so the current selection can be shared as a link.
st.experimental_set_query_params(dataset=[selected_dataset])
# Check if selected dataset can be streamed
is_valid_dataset = http_get(
    path="/is-valid",
    domain=DATASETS_PREVIEW_API,
    params={"dataset": selected_dataset},
).json()
# A response without a "valid" key is treated the same as an explicit False.
if not is_valid_dataset.get("valid", False):
    st.error(
        "The dataset you selected is not currently supported. Open a "
        "[discussion](https://huggingface.co/spaces/autoevaluate/autoevaluate/discussions) for support."
    )
    # Halt this script run: the remaining widgets query the same preview API
    # for splits and rows of this dataset.
    st.stop()
# Evaluation metadata for the chosen dataset; None means none was found, in
# which case the widgets below fall back to their first option.
metadata = get_metadata(selected_dataset)
print(f"INFO -- Dataset metadata: {metadata}")
if metadata is None:
    st.warning("No evaluation metadata found. Please configure the evaluation job below.")
def _index_or_first(options, value):
    """Return the position of ``value`` in ``options``, or 0 when it is absent.

    Lets widget defaults be seeded from dataset metadata without raising
    ``ValueError`` when the metadata names something the widget does not offer.
    """
    return options.index(value) if value in options else 0


# For each task: the standardised column names the evaluator expects, each
# paired with the label of the selectbox used to pick the matching dataset column.
_CLASSIFICATION_FIELDS = (
    ("text", "This column should contain the text to be classified"),
    ("target", "This column should contain the labels associated with the text"),
)
_TASK_COLUMN_FIELDS = {
    "binary_classification": _CLASSIFICATION_FIELDS,
    "multi_class_classification": _CLASSIFICATION_FIELDS,
    "entity_extraction": (
        ("tokens", "This column should contain the array of tokens to be classified"),
        ("tags", "This column should contain the labels associated with each part of the text"),
    ),
    "translation": (
        ("source", "This column should contain the text to be translated"),
        ("target", "This column should contain the target translation"),
    ),
    "summarization": (
        ("text", "This column should contain the text to be summarized"),
        ("target", "This column should contain the target summary"),
    ),
    "extractive_question_answering": (
        ("context", "This column should contain the question's context"),
        ("question", "This column should contain the question to be answered, given the context"),
        (
            "answers.text",
            "This column should contain example answers to the question, extracted from the context",
        ),
        (
            "answers.answer_start",
            "This column should contain the indices in the context of the first character of each `answers.text`",
        ),
    ),
}

with st.expander("Advanced configuration"):
    # Select task
    selected_task = st.selectbox(
        "Select a task",
        SUPPORTED_TASKS,
        # Falls back to the first task when the metadata names an unsupported one.
        index=_index_or_first(SUPPORTED_TASKS, metadata[0]["task_id"]) if metadata is not None else 0,
        help=(
            "Don't see your favourite task here? Open a "
            "[discussion](https://huggingface.co/spaces/autoevaluate/autoevaluate/discussions) to request it!"
        ),
    )

    # Select config
    configs = get_dataset_config_names(selected_dataset)
    selected_config = st.selectbox(
        "Select a config",
        configs,
        help=(
            "Some datasets contain several sub-datasets, known as _configurations_. "
            "Select one to evaluate your models on. "
            "See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) "
            "for more details.\n"
        ),
    )

    # Select splits
    splits_resp = http_get(
        path="/splits",
        domain=DATASETS_PREVIEW_API,
        params={"dataset": selected_dataset},
    )
    # Stays empty when the request fails, so the check below always has a value.
    split_names = []
    if splits_resp.status_code == 200:
        split_names = [
            split["split"] for split in splits_resp.json()["splits"] if split["config"] == selected_config
        ]
    if not split_names:
        st.error("Could not find any splits for the selected dataset and config.")
        st.stop()

    eval_split = metadata[0]["splits"].get("eval_split", None) if metadata is not None else None
    selected_split = st.selectbox(
        "Select a split",
        split_names,
        index=_index_or_first(split_names, eval_split),
        help="Be wary when evaluating models on the `train` split.",
    )

    # Select columns
    rows_resp = http_get(
        path="/rows",
        domain=DATASETS_PREVIEW_API,
        params={
            "dataset": selected_dataset,
            "config": selected_config,
            "split": selected_split,
        },
    ).json()
    # Column names come from the first returned row, with nested fields flattened
    # into dotted names by json_normalize.
    col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)

    st.markdown("**Map your dataset columns**")
    st.markdown(
        "The model evaluator uses a standardised set of column names for the input examples and labels. "
        "Please define the mapping between your dataset columns (right) and the standardised column names (left)."
    )
    col1, col2 = st.columns(2)

    # Mapping from the metadata used only to seed the selectbox defaults.
    default_mapping = None
    if metadata is not None:
        default_mapping = metadata[0]["col_mapping"]
        if selected_task == "extractive_question_answering":
            # Hub YAML parser converts periods to hyphens, so we remap them here
            default_mapping = format_col_mapping(default_mapping)

    # TODO: find a better way to layout these items
    # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
    task_fields = _TASK_COLUMN_FIELDS.get(selected_task, ())

    # The result always starts empty, so only the columns chosen below are sent
    # to the backend (no stale metadata entries survive a changed selection).
    col_mapping = {}
    with col1:
        for position, (standard_name, _label) in enumerate(task_fields):
            if position > 0:
                # Four empty text elements act as vertical spacing between the
                # names so they sit beside the selectboxes on the right.
                for _spacer in range(4):
                    st.text("")
            st.markdown(f"`{standard_name}` column")
    with col2:
        for standard_name, label in task_fields:
            default_index = 0
            if default_mapping is not None:
                # NOTE(review): get_key is defined in utils; this assumes it returns
                # the dataset column mapped to `standard_name` -- confirm. A result
                # that is not one of `col_names` selects the first column.
                default_index = _index_or_first(col_names, get_key(default_mapping, standard_name))
            chosen_col = st.selectbox(label, col_names, index=default_index)
            col_mapping[chosen_col] = standard_name

    # Select metrics
    st.markdown("**Select metrics**")
    st.markdown("The following metrics will be computed")
    default_metrics = TASK_TO_DEFAULT_METRICS[selected_task]
    # One grey rounded badge per default metric, floated left so they flow inline.
    html_string = " ".join(
        '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
        + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
        + 'padding-left:5px;color:white">'
        + metric
        + "</div></div>"
        for metric in default_metrics
    )
    st.markdown(html_string, unsafe_allow_html=True)
    selected_metrics = st.multiselect(
        "(Optional) Select additional metrics",
        sorted(set(SUPPORTED_METRICS) - set(default_metrics)),
        help=(
            "User-selected metrics will be computed with their default arguments. "
            "For example, `f1` will report results for binary labels. "
            "Check out the [available metrics](https://huggingface.co/metrics) for more details."
        ),
    )
with st.form(key="form"):
    compatible_models = get_compatible_models(selected_task, [selected_dataset])
    selected_models = st.multiselect(
        "Select the models you wish to evaluate",
        compatible_models,
        help=(
            "Don't see your favourite model in this list? Add the dataset and task it was trained on to the "
            "[model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)"
        ),
    )
    print("INFO -- Selected models before filter:", selected_models)
    if len(selected_models) > 0:
        # filter_evaluated_models is defined in the evaluation module; its result
        # replaces the user's selection for this task/dataset/config/split.
        selected_models = filter_evaluated_models(
            selected_models,
            selected_task,
            selected_dataset,
            selected_config,
            selected_split,
        )
    print("INFO -- Selected models after filter:", selected_models)
    # NOTE(review): the emoji in this block were restored from mis-decoded bytes
    # in the scraped text; confirm them against the deployed app.
    submit_button = st.form_submit_button("Evaluate models 🚀")

    if submit_button and len(selected_models) == 0:
        st.warning("⚠️ No models were selected for evaluation!")
    elif submit_button:
        # Short random suffix used to name the evaluation project.
        project_id = str(uuid.uuid4())[:8]
        project_payload = {
            "username": AUTOTRAIN_USERNAME,
            "proj_name": f"eval-project-{project_id}",
            "task": TASK_TO_ID[selected_task],
            "config": {
                # Need this dummy pair to enable translation
                "language": "en" if selected_task != "translation" else "en2de",
                "max_models": 5,
                "instance": {
                    "provider": "aws",
                    "instance_type": "ml.g4dn.4xlarge",
                    "max_runtime_seconds": 172800,
                    "num_instances": 1,
                    "disk_size_gb": 150,
                },
                "evaluation": {
                    "metrics": selected_metrics,
                    "models": selected_models,
                },
            },
        }
        print(f"INFO -- Payload: {project_payload}")
        project_json_resp = http_post(
            path="/projects/create",
            payload=project_payload,
            token=HF_TOKEN,
            domain=AUTOTRAIN_BACKEND_API,
        ).json()
        print(f"INFO -- Project creation response: {project_json_resp}")

        # Three backend calls run in sequence (create project, attach dataset,
        # start processing). The flag stays False if any of them fails, so every
        # failure ends in the error message below instead of passing silently.
        job_submitted = False
        if project_json_resp.get("created"):
            data_payload = {
                "split": 4,  # use "auto" split choice in AutoTrain
                "col_mapping": col_mapping,
                "load_config": {"max_size_bytes": 0, "shuffle": False},
            }
            data_json_resp = http_post(
                path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
                payload=data_payload,
                token=HF_TOKEN,
                domain=AUTOTRAIN_BACKEND_API,
                params={
                    "type": "dataset",
                    "config_name": selected_config,
                    "split_name": selected_split,
                },
            ).json()
            print(f"INFO -- Dataset creation response: {data_json_resp}")
            if data_json_resp.get("download_status") == 1:
                train_json_resp = http_get(
                    path=f"/projects/{project_json_resp['id']}/data/start_process",
                    token=HF_TOKEN,
                    domain=AUTOTRAIN_BACKEND_API,
                ).json()
                print(f"INFO -- AutoTrain job response: {train_json_resp}")
                job_submitted = bool(train_json_resp.get("success"))

        if job_submitted:
            st.success(f"✅ Successfully submitted evaluation job with project name {project_id}")
            st.markdown(
                "\nEvaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:\n"
                f"📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) "
                "to view the results from your submission\n"
            )
            print("INFO -- Pushing evaluation job logs to the Hub")
            # The log bundles the request payload with all three backend responses.
            evaluation_log = {
                "payload": project_payload,
                "project_creation_response": project_json_resp,
                "dataset_creation_response": data_json_resp,
                "autotrain_job_response": train_json_resp,
            }
            commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
        else:
            st.error("🙈 Oh no, there was an error submitting your evaluation job!")