|
from fixed_f1 import FixedF1 |
|
from fixed_precision import FixedPrecision |
|
from fixed_recall import FixedRecall |
|
import evaluate |
|
import gradio as gr |
|
import pandas as pd |
|
|
|
# Heading text for the demo; handed to `gr.Interface` as its `title`.
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
|
|
|
# Introductory text for the demo; handed to `gr.Interface` as its `description`.
# A trailing backslash joins a source line to the next one, and each explicit
# "\n" escape adds a line break to the resulting text.
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was
created to address this. \n

This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a
HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`.</p>
"""
|
|
|
# Closing text for the demo; handed to `gr.Interface` as its `article`.
# Built from adjacent string literals so the value is a single line of text.
article = (
    "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly "
    "trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
)
|
|
|
def evaluation(predictions, metrics) -> str:
    """Compute the requested fixed metrics for predicted vs. actual class labels.

    Both arguments are tables indexed by column name (presumably the pandas
    DataFrames produced by the two `gr.Dataframe` inputs -- confirm against
    the interface definition).

    Args:
        predictions: Table with "Predicted Class Label" and "Actual Class Label"
            columns. Every cell must be convertible with ``int()``.
        metrics: Table with "Metric" and "Averaging Type" columns. A row whose
            "Metric" is "f1", "precision" or "recall" selects the matching
            ``Fixed*`` metric, and its "Averaging Type" cell is passed to that
            metric as ``average``. Rows with any other name are ignored. If a
            name appears more than once, its first row is used.

    Returns:
        A message made of a fixed header line followed by the string form of
        the combined ``compute()`` result.
    """
    requested = set(metrics["Metric"].to_list())

    # Fixed order (f1, precision, recall) regardless of the row order in the table.
    supported = (
        ("f1", FixedF1),
        ("precision", FixedPrecision),
        ("recall", FixedRecall),
    )

    combined_list = []
    for name, metric_cls in supported:
        if name not in requested:
            continue
        matching = metrics.loc[metrics["Metric"] == name, "Averaging Type"]
        # Positional access: the filtered rows keep their original index labels,
        # so a label lookup with `[0]` only succeeds for the metric in row 0.
        combined_list.append(metric_cls(average=matching.iloc[0]))

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    # Interpolate rather than concatenate: the compute() result is not a str,
    # so `"..." + outputs` would raise TypeError.
    return f"Your metrics are as follows: \n{outputs}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def filter_records(records, gender):
    """Return the rows of ``records`` whose "gender" column equals ``gender``.

    Not referenced anywhere else in the visible part of this file.

    Args:
        records: Table with a "gender" column that supports boolean-mask
            indexing (e.g. a pandas DataFrame).
        gender: Value to compare each "gender" cell against.

    Returns:
        The subset of ``records`` for which the comparison is true.
    """
    matches = records["gender"] == gender
    return records[matches]
|
|
|
# Build the demo UI and start it as soon as the module runs: two editable
# tables go in (labels, and metric/averaging choices) and `evaluation` returns
# plain text. `space` holds whatever `.launch()` returns.
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            # `label` is the component's caption keyword; `label_name` is not accepted.
            label="Table of Predicted vs Actual Class Labels",
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels ",
        ),
    ],
    outputs="text",
    title=title,
    description=description,
    article=article,
    # One example: a value for each of the two input tables, in order.
    examples=[
        [
            [[1, 1], [1, 0], [2, 0], [1, 2], [2, 2]],
            [["f1", "weighted"], ["precision", "micro"], ["recall", "weighted"]],
        ],
    ],
    cache_examples=False,
).launch()