# model-evaluator/evaluation.py
import copy
from dataclasses import dataclass
from typing import Optional

import streamlit as st
from huggingface_hub import DatasetFilter, HfApi
from huggingface_hub.hf_api import DatasetInfo


@dataclass(frozen=True, eq=True)
class EvaluationInfo:
    task: str
    model: str
    dataset_name: str
    dataset_config: str
    dataset_split: str
    # Stored as a frozenset so instances stay hashable (frozen dataclasses
    # derive __hash__ from their fields).
    metrics: frozenset


def create_evaluation_info(dataset_info: DatasetInfo) -> Optional[EvaluationInfo]:
    """Build an EvaluationInfo from a dataset card, or return None if the card is missing."""
    if dataset_info.cardData is None:
        return None
    metadata = dataset_info.cardData["eval_info"]
    metadata.pop("col_mapping", None)
    # TODO(lewtun): populate dataset cards with metric info
    if "metrics" not in metadata:
        metadata["metrics"] = frozenset()
    else:
        metadata["metrics"] = frozenset(metadata["metrics"])
    return EvaluationInfo(**metadata)
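
# For reference, the `eval_info` card metadata consumed above is expected to
# carry the EvaluationInfo fields (illustrative sketch; the values below are
# hypothetical, and the real shape comes from the autoevaluate dataset cards):
#
#   eval_info:
#     task: text-classification
#     model: distilbert-base-uncased-finetuned-sst-2-english
#     dataset_name: glue
#     dataset_config: sst2
#     dataset_split: validation
#     col_mapping: ...   # dropped before constructing EvaluationInfo
#     metrics: [accuracy]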


def get_evaluation_infos():
    evaluation_infos = []
    filt = DatasetFilter(author="autoevaluate")
    autoevaluate_datasets = HfApi().list_datasets(filter=filt, full=True)
    for dset in autoevaluate_datasets:
        try:
            evaluation_info = create_evaluation_info(dset)
            # Skip datasets without card metadata rather than collecting None.
            if evaluation_info is not None:
                evaluation_infos.append(evaluation_info)
        except Exception as e:
            print(f"Error processing dataset {dset}: {e}")
    return evaluation_infos


def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
    evaluation_infos = get_evaluation_infos()
    # Iterate over a copy so entries can be safely removed from `models`.
    models_to_filter = copy.copy(models)
    for model in models_to_filter:
        evaluation_info = EvaluationInfo(
            task=task,
            model=model,
            dataset_name=dataset_name,
            dataset_config=dataset_config,
            dataset_split=dataset_split,
            metrics=frozenset(metrics),
        )
        if evaluation_info in evaluation_infos:
            st.info(
                f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated "
                "on this configuration. This model will be excluded from the evaluation job..."
            )
            models.remove(model)
    return models
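

# Illustrative usage sketch (not part of the app): the model name, dataset, and
# metric below are hypothetical placeholders. `st.info` only renders inside a
# running Streamlit session, so execute this via `streamlit run evaluation.py`.
if __name__ == "__main__":
    candidate_models = ["distilbert-base-uncased-finetuned-sst-2-english"]
    remaining = filter_evaluated_models(
        models=candidate_models,
        task="text-classification",
        dataset_name="glue",
        dataset_config="sst2",
        dataset_split="validation",
        metrics=["accuracy"],
    )
    print(f"Models still to evaluate: {remaining}")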