|
import evaluate
import gradio as gr
import numpy as np
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
|
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was \
created to address this - follow the link to view the source! To see each of these abstracted classes at work independently, view the 🤗 Space I've constructed for each: \
[FixedF1📈](https://huggingface.co/spaces/MarioBarbeque/FixedF1), [FixedPrecision🎯](https://huggingface.co/spaces/MarioBarbeque/FixedPrecision), \
[FixedRecall📉](https://huggingface.co/spaces/MarioBarbeque/FixedRecall).\n

This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own method for averaging across labels, before `combine`-ing them into an \
HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`. \n

In general, one writes the following:\n

```python
f1 = FixedF1(average=...)
precision = FixedPrecision(average=...)
recall = FixedRecall(average=...)

combined = evaluate.combine([f1, precision, recall])

combined.add_batch(predictions=..., references=...)
combined.compute()
```\n

where the `average` parameter can be different at instantiation time for each of the metrics. Acceptable values include \
`[None, 'micro', 'macro', 'weighted']` (or `binary` if there are only two labels). \n
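
For instance, mirroring the example data pre-filled below (with one arbitrarily chosen averaging method per metric), a concrete call might look like:\n

```python
f1 = FixedF1(average="weighted")
precision = FixedPrecision(average="micro")
recall = FixedRecall(average="macro")

combined = evaluate.combine([f1, precision, recall])

combined.add_batch(predictions=[0, 1, 2, 1, 0], references=[1, 1, 2, 0, 0])
combined.compute()  # returns a dictionary of the form {'f1': ..., 'precision': ..., 'recall': ...}
```\n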
|
|
|
Try it out using the examples below! Then try picking various averaging methods yourself!
</p>
"""
|
|
def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    """Map each requested metric name to the averaging method chosen for it in the metrics table."""

    metric_map = dict()

    for key in metric_set:
        for val in metric_df.loc[metric_df["Metric"] == key]["Averaging Method"]:
            metric_map[key] = val

    return metric_map
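
# For illustration: with the pre-filled example below, the metrics table has rows
# ["f1", "weighted"], ["precision", "micro"], ["recall", "macro"], so populate_map returns
# {"f1": "weighted", "precision": "micro", "recall": "macro"}.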
|
|
|
|
|
def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Combine the requested fixed metrics and compute them over the predicted vs. actual class labels."""

    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)
    combined_list = []

    # Instantiate only the metrics requested in the table, each with its chosen averaging method
    if "f1" in metric_set:
        f1 = FixedF1(average=metric_map["f1"] if metric_map["f1"] != "None" else None)
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metric_map["precision"] if metric_map["precision"] != "None" else None, zero_division=np.nan)
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metric_map["recall"] if metric_map["recall"] != "None" else None)
        combined_list.append(recall)

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"
|
|
|
|
|
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels"
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Method"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels"
        )
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]]),
            pd.DataFrame(columns=["Metric", "Averaging Method"], data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]])
        ]
    ],
    cache_examples=False
).launch()