|
from fixed_f1 import FixedF1 |
|
from fixed_precision import FixedPrecision |
|
from fixed_recall import FixedRecall |
|
import evaluate |
|
import gradio as gr |
|
import pandas as pd |
|
|
|
# Heading shown at the top of the Space.
# The two emoji are written as escapes (U+1F917 hugging face, U+1FAB2 beetle)
# so the literal stays pure ASCII and cannot be garbled by a wrong-encoding
# round trip of this file.
title = "'Combine' multiple metrics with this \U0001F917 Evaluate \U0001FAB2 Fix!"
|
|
|
# Introductory text rendered under the title.
# Emoji are written as escapes (U+1F917 hugging face, U+1FAB2 beetle) so the
# literal stays pure ASCII and survives any re-encoding of this file.
# A trailing backslash joins the next source line into the same paragraph;
# an explicit "\n" followed by a blank line starts a new paragraph.
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the \U0001F917 ecosystem, I've put together this Space to show off a temporary fix for a current \U0001FAB2 in the \U0001F917 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was
created to address this. \n

This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a
HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`.</p>
"""
|
|
|
|
|
def populate_map(
    metric_df: pd.DataFrame,
    metric_set: set,
    *,
    metric_col: str = "Metric",
    average_col: str = "Averaging Type",
) -> dict:
    """Map each requested metric name to its averaging type.

    Args:
        metric_df: Table with one row per metric, read through the columns
            named by ``metric_col`` and ``average_col``.
        metric_set: Metric names to include in the result.
        metric_col: Column holding the metric names. Defaults to ``"Metric"``,
            the header of the metrics table built in this file.
        average_col: Column holding the averaging type. Defaults to
            ``"Averaging Type"``, the other header of that table.

    Returns:
        A dict of metric name -> averaging type. Names in ``metric_set`` that
        have no row in ``metric_df`` are omitted; when a name appears on
        several rows, the last row wins.

    Raises:
        KeyError: If ``metric_df`` lacks either column.
    """
    # The column names must match the headers of the table that `evaluation`
    # passes in ("Metric" / "Averaging Type"); looking up "metric" / "average"
    # raises KeyError on that table.
    metric_map = {}

    # One pass over the rows; a later row overwrites an earlier one for the
    # same metric name.
    for name, average in zip(metric_df[metric_col], metric_df[average_col]):
        if name in metric_set:
            metric_map[name] = average

    return metric_map
|
|
|
|
|
def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Compute the selected metrics over the submitted class labels.

    Args:
        predictions_df: Table with "Predicted Class Label" and
            "Actual Class Label" columns; every cell is converted with
            ``int`` before being handed to the metrics.
        metrics_df: Table with a "Metric" column. Rows naming "f1",
            "precision" or "recall" select that metric, configured with the
            averaging type ``populate_map`` returns for it. Other names are
            ignored.

    Returns:
        A message embedding the result of the combined ``compute()`` call.
    """
    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)

    # Each metric must be built with its OWN averaging type. Reading
    # metric_map["f1"] for precision and recall would ignore the user's
    # choice and raise KeyError whenever f1 is not selected.
    # Dict order fixes the order in which the metrics are combined.
    metric_classes = {
        "f1": FixedF1,
        "precision": FixedPrecision,
        "recall": FixedRecall,
    }
    combined_list = [
        metric_cls(average=metric_map[name])
        for name, metric_cls in metric_classes.items()
        if name in metric_set
    ]

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"
|
|
|
|
|
# --- Input components -------------------------------------------------------

# Editable two-column table of numeric class labels (five rows to start with).
label_table = gr.Dataframe(
    label="Table of Predicted vs Actual Class Labels",
    headers=["Predicted Class Label", "Actual Class Label"],
    datatype=["number", "number"],
    row_count=5,
    col_count=(2, "fixed"),
)

# Fixed 3x2 table: one row per metric, paired with its averaging type.
metric_table = gr.Dataframe(
    label="Table of Metrics and Averaging Method across Labels ",
    headers=["Metric", "Averaging Type"],
    datatype=["str", "str"],
    row_count=(3, "fixed"),
    col_count=(2, "fixed"),
)

# --- Pre-filled example -----------------------------------------------------

example_labels = pd.DataFrame(
    {
        "Predicted Class Label": [0, 1, 2, 1, 0],
        "Actual Class Label": [1, 1, 2, 0, 0],
    }
)
example_metrics = pd.DataFrame(
    {
        "Metric": ["f1", "precision", "recall"],
        "Averaging Type": ["weighted", "micro", "macro"],
    }
)

# --- App --------------------------------------------------------------------

# Build the interface around `evaluation` and launch it immediately on import;
# `space` is bound to whatever `launch()` returns.
space = gr.Interface(
    title=title,
    description=description,
    fn=evaluation,
    inputs=[label_table, metric_table],
    outputs="text",
    examples=[[example_labels, example_metrics]],
    cache_examples=False,
).launch()